From 47c43a3ce997ac79a49bb3f8d4e55e1b4521a4e4 Mon Sep 17 00:00:00 2001 From: Khauneesh-AI Date: Thu, 17 Apr 2025 20:55:44 +0530 Subject: [PATCH 01/10] endpoint for example content and workflow parameter set for freeform --- app/main.py | 34 ++++++++++++++++++++++++++++++++++ app/models/request_models.py | 1 + 2 files changed, 35 insertions(+) diff --git a/app/main.py b/app/main.py index ec8b7e9..fceed55 100644 --- a/app/main.py +++ b/app/main.py @@ -363,6 +363,40 @@ async def get_dataset_size(request:JsonDataSize): return {"dataset_size": len(inputs)} +@app.post("/json/get_content", include_in_schema=True, responses = responses, + description = "get total dataset size for jsons") +async def get_dataset_size(request: RelativePath): + + if request.path: + path = request.path + try: + with open(path) as f: + data = json.load(f) + + + except json.JSONDecodeError as e: + error_msg = f"Invalid JSON format in file {path}: {str(e)}" + print(error_msg) + return JSONResponse( + status_code=400, + content={"status": "failed", "error": error_msg} + ) + except (KeyError, ValueError) as e: + print(str(e)) + return JSONResponse( + status_code=400, + content={"status": "failed", "error": str(e)} + ) + except Exception as e: + error_msg = f"Error processing {path}: {str(e)}" + print(error_msg) + return JSONResponse( + status_code=400, + content={"status": "failed", "error": error_msg} + ) + + return {"data": data} + @app.post("/synthesis/generate", include_in_schema=True, responses=responses, diff --git a/app/models/request_models.py b/app/models/request_models.py index 08252a3..4b8926a 100644 --- a/app/models/request_models.py +++ b/app/models/request_models.py @@ -13,6 +13,7 @@ class Technique(str, Enum): SFT = "sft" Custom_Workflow = "custom_workflow" Model_Alignment = "model_alignment" + Freeform = "freeform" class Example(BaseModel): From 487678db124242152b404945c6922a9899a76997 Mon Sep 17 00:00:00 2001 From: Khauneesh-AI Date: Thu, 17 Apr 2025 20:59:53 +0530 Subject: [PATCH 02/10] endpoint for example content and workflow parameter set for freeform --- app/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/main.py b/app/main.py index fceed55..e4b9b5d 100644 --- a/app/main.py +++ b/app/main.py @@ -368,7 +368,8 @@ async def get_dataset_size(request:JsonDataSize): async def get_dataset_size(request: RelativePath): if request.path: - path = request.path + path = path_manager.get_str_path(request.path) + try: with open(path) as f: data = json.load(f) From e4d7aeae395f04a76f66c3f1740ff2e016bdbb37 Mon Sep 17 00:00:00 2001 From: Khauneesh-AI Date: Thu, 17 Apr 2025 21:01:08 +0530 Subject: [PATCH 03/10] endpoint for example content and workflow parameter set for freeform --- app/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/main.py b/app/main.py index e4b9b5d..c8974a2 100644 --- a/app/main.py +++ b/app/main.py @@ -364,7 +364,7 @@ async def get_dataset_size(request:JsonDataSize): return {"dataset_size": len(inputs)} @app.post("/json/get_content", include_in_schema=True, responses = responses, - description = "get total dataset size for jsons") + description = "get json content") async def get_dataset_size(request: RelativePath): if request.path: From e87fa0c098d8003c868c89b18fc45f2001d4ca99 Mon Sep 17 00:00:00 2001 From: Khauneesh-AI Date: Tue, 22 Apr 2025 23:22:25 +0530 Subject: [PATCH 04/10] Add S3 export functionality with database support --- .../1a8fdc23eb6f_add_s3_export_path.py | 38 +++++++ app/core/database.py | 48 +++++++++---- 
app/migrations/alembic_schema_models.py | 2 + app/models/request_models.py | 43 ++++---- app/run_export_job.py | 18 ---- app/services/export_results.py | 74 ++++++++++---- app/services/s3_export.py | 98 +++++++++++++++++++ app/services/synthesis_job.py | 47 +++++++-- 8 files changed, 282 insertions(+), 86 deletions(-) create mode 100644 alembic/versions/1a8fdc23eb6f_add_s3_export_path.py create mode 100644 app/services/s3_export.py diff --git a/alembic/versions/1a8fdc23eb6f_add_s3_export_path.py b/alembic/versions/1a8fdc23eb6f_add_s3_export_path.py new file mode 100644 index 0000000..0497be5 --- /dev/null +++ b/alembic/versions/1a8fdc23eb6f_add_s3_export_path.py @@ -0,0 +1,38 @@ +"""add_s3_export_path + +Revision ID: 1a8fdc23eb6f +Revises: 9023b46c8d4c +Create Date: 2025-04-22 20:01:13.247491 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '1a8fdc23eb6f' +down_revision: Union[str, None] = '9023b46c8d4c' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Add s3_export_path column to generation_metadata table + with op.batch_alter_table('generation_metadata', schema=None) as batch_op: + batch_op.add_column(sa.Column('s3_export_path', sa.Text(), nullable=True)) + + # Add s3_export_path column to export_metadata table + with op.batch_alter_table('export_metadata', schema=None) as batch_op: + batch_op.add_column(sa.Column('s3_export_path', sa.Text(), nullable=True)) + + +def downgrade() -> None: + # Remove s3_export_path column from generation_metadata table + with op.batch_alter_table('generation_metadata', schema=None) as batch_op: + batch_op.drop_column('s3_export_path') + + # Remove s3_export_path column from export_metadata table + with op.batch_alter_table('export_metadata', schema=None) as batch_op: + batch_op.drop_column('s3_export_path') diff --git a/app/core/database.py b/app/core/database.py index 3e9ee2b..8a94288 100644 --- a/app/core/database.py +++ b/app/core/database.py @@ -60,6 +60,7 @@ def init_db(self): display_name TEXT, local_export_path TEXT, hf_export_path TEXT, + s3_export_path TEXT, num_questions FLOAT, total_count FLOAT, topics TEXT, @@ -107,6 +108,7 @@ def init_db(self): display_name TEXT, local_export_path TEXT, hf_export_path TEXT, + s3_export_path TEXT, job_id TEXT, job_name TEXT UNIQUE, job_status TEXT, @@ -145,29 +147,24 @@ def save_generation_metadata(self, metadata: Dict) -> int: try: # Prepare data outside transaction if metadata.get('generate_file_name'): - output_paths = metadata.get('output_path', {}) else: - output_paths = {} - - # Use a single connection with enhanced settings with self.get_connection() as conn: conn.execute("BEGIN IMMEDIATE") - cursor = conn.cursor() query = """ INSERT INTO generation_metadata ( - timestamp, technique, model_id, inference_type, caii_endpoint, use_case, - custom_prompt, model_parameters, input_key, output_key, output_value, generate_file_name, - display_name, local_export_path, hf_export_path, - num_questions, total_count, topics, examples, - schema, doc_paths, input_path, job_id, job_name, job_status, job_creator_name - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ timestamp, technique, model_id, inference_type, caii_endpoint, use_case, + custom_prompt, model_parameters, input_key, output_key, output_value, generate_file_name, + display_name, local_export_path, hf_export_path, s3_export_path, + num_questions, total_count, topics, examples, + schema, doc_paths, input_path, job_id, job_name, job_status, job_creator_name + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """ values = ( @@ -186,6 +183,7 @@ def save_generation_metadata(self, metadata: Dict) -> int: metadata.get('display_name', None), output_paths.get('local', None), output_paths.get('huggingface', None), + output_paths.get('s3', None), metadata.get('num_questions', None), metadata.get('total_count', None), metadata.get('topics', None), @@ -198,20 +196,18 @@ def save_generation_metadata(self, metadata: Dict) -> int: metadata.get('job_status', None), metadata.get('job_creator_name', None) ) - #print(values) cursor.execute(query, values) conn.commit() return cursor.lastrowid except sqlite3.OperationalError as e: - if conn: + if 'conn' in locals(): conn.rollback() print(f"Database operation error in save_generation_metadata: {e}") - raise except Exception as e: - if conn: + if 'conn' in locals(): conn.rollback() print(f"Error saving metadata to database: {str(e)}") raise @@ -359,7 +355,6 @@ def save_evaluation_metadata(self, metadata: Dict) -> int: def save_export_metadata(self, metadata: Dict) -> int: """Save export metadata to database with prepared transaction""" try: - # Use a single connection with enhanced settings with self.get_connection() as conn: conn.execute("BEGIN IMMEDIATE") @@ -373,11 +368,12 @@ def save_export_metadata(self, metadata: Dict) -> int: display_name, local_export_path, hf_export_path, + s3_export_path, job_id, job_name, job_status, job_creator_name - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """ values = ( @@ -386,6 +382,7 @@ def save_export_metadata(self, metadata: Dict) -> int: metadata.get('display_name'), metadata.get('local_export_path', None), metadata.get('hf_export_path', None), + metadata.get('s3_export_path', None), # Add this line metadata.get('job_id', None), metadata.get('job_name', None), metadata.get('job_status', None), @@ -1131,4 +1128,21 @@ def backup_and_restore_db(self, force_restore: bool = False) -> bool: print(f"Force restore failed: {str(restore_error)}") return False + def update_s3_path(self, file_name: str, s3_path: str): + """Update s3_export_path for a generation""" + try: + with self.get_connection() as conn: + conn.execute("BEGIN IMMEDIATE") + cursor = conn.cursor() + + # Update the s3_path + cursor.execute( + "UPDATE generation_metadata SET s3_export_path = ? 
WHERE generate_file_name = ?", + (s3_path, file_name) + ) + conn.commit() + print(f"S3 path update successful for file: {file_name}") + except Exception as e: + print(f"Error updating S3 export path: {str(e)}") + raise diff --git a/app/migrations/alembic_schema_models.py b/app/migrations/alembic_schema_models.py index 4035467..3967a11 100644 --- a/app/migrations/alembic_schema_models.py +++ b/app/migrations/alembic_schema_models.py @@ -23,6 +23,7 @@ class GenerationMetadataModel(Base): display_name = Column(Text) local_export_path = Column(Text) hf_export_path = Column(Text) + s3_export_path = Column(Text) num_questions = Column(Float) total_count = Column(Float) topics = Column(Text) @@ -66,6 +67,7 @@ class ExportMetadataModel(Base): display_name = Column(Text) local_export_path = Column(Text) hf_export_path = Column(Text) + s3_export_path = Column(Text) job_id = Column(Text) job_name = Column(Text, unique=True) job_status = Column(Text) diff --git a/app/models/request_models.py b/app/models/request_models.py index 4b8926a..951c060 100644 --- a/app/models/request_models.py +++ b/app/models/request_models.py @@ -45,10 +45,12 @@ class Example_eval(BaseModel): ) +# In app/models/request_models.py class S3Config(BaseModel): """S3 export configuration""" bucket: str - key: str + key: str = "" # Make key optional with default empty string + create_if_not_exists: bool = True # Flag to create bucket if it doesn't exist class HFConfig(BaseModel): """HF export configuration""" @@ -59,41 +61,40 @@ class HFConfig(BaseModel): hf_commit_message: Optional[str] = "Hugging face export" # Commit message class Export_synth(BaseModel): - # Export configuration - export_type: List[str] = Field(default_factory=lambda: ["huggingface"]) # Accept multiple export types (e.g., ["s3", "huggingface"]) - file_path:str - display_name:Optional[str]= None + # Existing fields... 
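+    # Each entry in export_type ("s3", "huggingface") must be matched by the
+    # corresponding config object below; the export service rejects an "s3"
+    # request that arrives without an s3_config.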
+ export_type: List[str] = Field(default_factory=lambda: ["huggingface"]) + file_path: str + display_name: Optional[str] = None output_key: Optional[str] = 'Prompt' output_value: Optional[str] = 'Completion' - # Hugging Face-specific fields - hf_config:HFConfig + hf_config: Optional[HFConfig] = None # Make HF config optional # Optional s3 config s3_config: Optional[S3Config] = None - model_config = ConfigDict(protected_namespaces=(), + model_config = ConfigDict( + protected_namespaces=(), json_schema_extra={ "example": { - "export_type": [ - "huggingface" - ], - "file_path": "qa_pairs_claude_20241204_132411_test.json", - "hf_config": { - "hf_token": "your token", - "hf_username": "your_username", - "hf_repo_name": "file_name", - "hf_commit_message": "dataset trial" - } - - + "export_type": ["huggingface", "s3"], + "file_path": "qa_pairs_claude_20241204_132411_test.json", + "hf_config": { + "hf_token": "your token", + "hf_username": "your_username", + "hf_repo_name": "file_name", + "hf_commit_message": "dataset trial" + }, + "s3_config": { + "bucket": "my-dataset-bucket", + "create_if_not_exists": True + } } } ) - class ModelParameters(BaseModel): """Low-level model parameters""" temperature: float = Field(default=0.0, ge=0.0, le=2.0, description="Controls randomness (0.0 to 1.0)") diff --git a/app/run_export_job.py b/app/run_export_job.py index 067babe..99fb48d 100644 --- a/app/run_export_job.py +++ b/app/run_export_job.py @@ -8,24 +8,6 @@ os.chdir("/home/cdsw/synthetic-data-studio") -# def check_and_install_requirements(): -# """Check and install requirements from requirements.txt""" -# # Get the current working directory instead of using __file__ -# current_dir = os.getcwd() -# requirements_path = os.path.join(current_dir, 'requirements.txt') - -# if os.path.exists(requirements_path): -# try: -# print(f"Installing requirements from: {requirements_path}") -# subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-r', requirements_path]) -# except subprocess.CalledProcessError as e: -# print(f"Error installing requirements: {e}") -# sys.exit(1) -# else: -# print("No requirements.txt found, continuing with existing packages") - -# # Run installation check at start -# check_and_install_requirements() # Get the current notebook's directory notebook_dir = os.getcwd() diff --git a/app/services/export_results.py b/app/services/export_results.py index e233e77..89205ba 100644 --- a/app/services/export_results.py +++ b/app/services/export_results.py @@ -15,6 +15,7 @@ from app.models.request_models import Export_synth from app.core.database import DatabaseManager +from app.services.s3_export import export_to_s3 import logging from logging.handlers import RotatingFileHandler @@ -101,30 +102,62 @@ def _create_dataset(self, records:List, output_key, output_value, file_path) -> return dataset - def export(self,request:Export_synth): + def export(self, request: Export_synth): try: export_paths = {} file_name = os.path.basename(request.file_path) - try: - with open(request.file_path, 'r') as f: - output_data = json.load(f) - except FileNotFoundError: - raise HTTPException(status_code=404, detail=f"File not found: {request.file_path}") - except json.JSONDecodeError as e: - raise HTTPException(status_code=400, detail=f"Invalid JSON file: {str(e)}") for export_type in request.export_type: - if export_type == "s3" and request.s3_config: - s3_client = boto3.client("s3") - s3_client.put_object( - Bucket=request.s3_config.bucket, - Key=request.s3_config.key, - Body=json.dumps(output_data, indent=2), - ) 
- export_paths['s3']= f"s3://{request.s3_config.bucket}/{request.s3_config.key}" - self.logger.info(f"Results saved to S3: {export_paths['s3']}") - + # S3 Export + if export_type == "s3": + if not request.s3_config: + raise HTTPException(status_code=400, detail="S3 configuration required for S3 export") + + try: + # Get bucket and key from request + bucket_name = request.s3_config.bucket + key = request.s3_config.key or file_name + + # Override with display_name if provided + if request.display_name and not request.s3_config.key: + key = f"{request.display_name}.json" + + + + + create_bucket = getattr(request.s3_config, 'create_if_not_exists', True) + + s3_result = export_to_s3( + file_path=request.file_path, + bucket_name=bucket_name, + key=key, + create_bucket=create_bucket + ) + + s3_path = s3_result['s3'] + self.logger.info(f"Results saved to S3: {s3_path}") + + # Update database with S3 path + self.db.update_s3_path(file_name, s3_path) + self.logger.info(f"Generation Metadata updated for s3_path: {s3_path}") + + export_paths['s3'] = s3_path + + except Exception as e: + self.logger.error(f"Error exporting to S3: {str(e)}", exc_info=True) + raise APIError(f"S3 export failed: {str(e)}") + + # HuggingFace Export (existing code) elif export_type == "huggingface" and request.hf_config: + # We still need to read the file for HuggingFace export + try: + with open(request.file_path, 'r') as f: + output_data = json.load(f) + except FileNotFoundError: + raise HTTPException(status_code=404, detail=f"File not found: {request.file_path}") + except json.JSONDecodeError as e: + raise HTTPException(status_code=400, detail=f"Invalid JSON file: {str(e)}") + self.logger.info(f"Creating HuggingFace dataset: {request.hf_config.hf_repo_name}") # Set up HuggingFace authentication @@ -132,7 +165,6 @@ def export(self,request:Export_synth): # Convert JSON to dataset dataset = self._create_dataset(output_data, request.output_key, request.output_value, request.file_path) - print(dataset) # Push to HuggingFace Hub as a dataset repo_id = f"{request.hf_config.hf_username}/{request.hf_config.hf_repo_name}" @@ -146,8 +178,8 @@ def export(self,request:Export_synth): self.logger.info(f"Dataset published to HuggingFace: {export_paths['huggingface']}") self.db.update_hf_path(file_name, export_paths['huggingface']) self.logger.info(f"Generation Metadata updated for hf_path: {export_paths['huggingface']}") - - return export_paths + + return export_paths except Exception as e: self.logger.error(f"Error saving results: {str(e)}", exc_info=True) diff --git a/app/services/s3_export.py b/app/services/s3_export.py new file mode 100644 index 0000000..ff5e774 --- /dev/null +++ b/app/services/s3_export.py @@ -0,0 +1,98 @@ +# In app/services/s3_export.py +import os +import logging +import boto3 +from botocore.exceptions import ClientError +from typing import Dict, Any, Optional + +def export_to_s3(file_path: str, bucket_name: str, key: str = "", + create_bucket: bool = True, access_key: str = None, + secret_key: str = None, region: str = None) -> Dict[str, str]: + """ + Export a dataset to AWS S3 + + Args: + file_path: Path to the JSON file to export + bucket_name: Name of the S3 bucket + key: Optional key name for the file in S3 (path/to/file.json) + create_bucket: Whether to create the bucket if it doesn't exist + access_key: AWS access key (defaults to environment variable) + secret_key: AWS secret key (defaults to environment variable) + region: AWS region (defaults to environment variable) + + Returns: + Dictionary with the S3 
path of the exported file + """ + try: + # Check if file exists + if not os.path.exists(file_path): + raise ValueError(f"File not found: {file_path}") + + # Use provided credentials or environment variables + access_key = access_key or os.environ.get('AWS_ACCESS_KEY_ID') + secret_key = secret_key or os.environ.get('AWS_SECRET_ACCESS_KEY') + region = region or os.environ.get('AWS_DEFAULT_REGION') + + if not access_key or not secret_key: + raise ValueError("AWS credentials not provided and not found in environment variables") + + # Set up S3 client + s3_args = { + 'aws_access_key_id': access_key, + 'aws_secret_access_key': secret_key + } + + if region: + s3_args['region_name'] = region + + s3_client = boto3.client('s3', **s3_args) + + # Create key name if not provided + if not key: + key = os.path.basename(file_path) + + # Check if bucket exists + try: + s3_client.head_bucket(Bucket=bucket_name) + except ClientError as e: + error_code = e.response.get('Error', {}).get('Code', '') + if error_code == '404' and create_bucket: + # If bucket doesn't exist and create_bucket is True, create it + try: + if region and region != 'us-east-1': + s3_client.create_bucket( + Bucket=bucket_name, + CreateBucketConfiguration={'LocationConstraint': region} + ) + else: + s3_client.create_bucket(Bucket=bucket_name) + print(f"Bucket {bucket_name} created successfully") + except ClientError as create_error: + raise ValueError(f"Failed to create bucket: {str(create_error)}") + else: + # If there's another error or create_bucket is False + if error_code == '404': + raise ValueError(f"Bucket {bucket_name} does not exist and create_bucket is False") + else: + raise ValueError(f"Error accessing bucket: {str(e)}") + + # Upload file to S3 using upload_file method + try: + s3_client.upload_file( + file_path, + bucket_name, + key, + ExtraArgs={'ContentType': 'application/json'} + ) + except ClientError as e: + raise ValueError(f"Error uploading file to S3: {str(e)}") + + s3_path = f"s3://{bucket_name}/{key}" + print(f"File successfully uploaded to {s3_path}") + + return {'s3': s3_path} + + except Exception as e: + error_msg = f"Error exporting to S3: {str(e)}" + print(error_msg) + raise Exception(error_msg) \ No newline at end of file diff --git a/app/services/synthesis_job.py b/app/services/synthesis_job.py index fc3f02f..323fa7f 100644 --- a/app/services/synthesis_job.py +++ b/app/services/synthesis_job.py @@ -206,28 +206,49 @@ def evaluate_job(self, request: Any, cpu: int = 2, memory: int = 4, request_id = return {"job_name": job_name, "job_id": job_run.job_id} #@track_job("export") + # In the file containing synthesis_job def export_job(self, request: Any, cpu: int = 2, memory: int = 4) -> Dict[str, str]: """Create and run an export job""" params = request.model_dump() + # Generate job name based on export type + if "s3" in request.export_type and request.s3_config: + job_name_prefix = f"s3_{request.s3_config.bucket}" + elif "huggingface" in request.export_type and request.hf_config: + job_name_prefix = f"hf_{request.hf_config.hf_repo_name}" + else: + job_name_prefix = "export" + job_name, job_run, file_name = self._create_and_run_job( "run_export_job.py", - f"hf_{request.hf_config.hf_repo_name}", + job_name_prefix, params, cpu=cpu, memory=memory - ) - repo_id = f"{request.hf_config.hf_username}/{request.hf_config.hf_repo_name}" - export_path = f"https://huggingface.co/datasets/{repo_id}" + # Initialize export paths + export_paths = {} + + # Add HF export path if applicable + if "huggingface" in request.export_type and 
request.hf_config: + repo_id = f"{request.hf_config.hf_username}/{request.hf_config.hf_repo_name}" + export_paths['huggingface'] = f"https://huggingface.co/datasets/{repo_id}" + + # Add S3 export path if applicable + if "s3" in request.export_type and request.s3_config: + key = request.s3_config.key or os.path.basename(request.file_path) + if request.display_name and not request.s3_config.key: + key = f"{request.display_name}.json" + export_paths['s3'] = f"s3://{request.s3_config.bucket}/{key}" metadata = { "timestamp": datetime.now(timezone.utc).isoformat(), - "display_export_name": request.hf_config.hf_repo_name, + "display_export_name": request.display_name or os.path.basename(request.file_path), "display_name": request.display_name, "local_export_path": request.file_path, - "hf_export_path": export_path, + "hf_export_path": export_paths.get('huggingface', ''), + "s3_export_path": export_paths.get('s3', ''), "job_id": job_run.job_id, "job_name": job_name, "job_status": self.get_job_status(job_run.job_id), @@ -235,13 +256,21 @@ def export_job(self, request: Any, cpu: int = 2, memory: int = 4) -> Dict[str, s "cpu": cpu, "memory": memory } - + self.db_manager.save_export_metadata(metadata) - return { + + result = { "job_name": job_name, "job_id": job_run.job_id, - "hf_link": export_path } + + # Add export paths to result + if 'huggingface' in export_paths: + result["hf_link"] = export_paths['huggingface'] + if 's3' in export_paths: + result["s3_link"] = export_paths['s3'] + + return result def _calculate_total_count(self, request: Any) -> int: From 6cab63ecd6cde0d18af8171c0262cf155f028dfa Mon Sep 17 00:00:00 2001 From: Khauneesh-AI Date: Wed, 23 Apr 2025 22:57:30 +0530 Subject: [PATCH 05/10] raise error for incorrect db update for s3 upload --- .gitignore | 1 + app/core/database.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d7fb6d4..c08e1e9 100644 --- a/.gitignore +++ b/.gitignore @@ -57,6 +57,7 @@ Khauneesh/ # DB *metadata.db-shm *metadata.db-wal +telemetry.db # Test and coverage reports .coverage diff --git a/app/core/database.py b/app/core/database.py index 8a94288..64433a8 100644 --- a/app/core/database.py +++ b/app/core/database.py @@ -1144,5 +1144,5 @@ def update_s3_path(self, file_name: str, s3_path: str): print(f"S3 path update successful for file: {file_name}") except Exception as e: print(f"Error updating S3 export path: {str(e)}") - raise + raise RuntimeError(f"Error updating S3 export path: {str(e)}") From bf22fed0fa2573efe3240d37fd2fd5175ea28b65 Mon Sep 17 00:00:00 2001 From: Khauneesh-AI Date: Tue, 29 Apr 2025 19:15:44 +0530 Subject: [PATCH 06/10] complete prompt endpoints for both generation and evaluation --- app/main.py | 145 ++++++++++++++++++++++++++++++++++- app/models/request_models.py | 1 + 2 files changed, 145 insertions(+), 1 deletion(-) diff --git a/app/main.py b/app/main.py index c8974a2..f062f9c 100644 --- a/app/main.py +++ b/app/main.py @@ -34,7 +34,7 @@ sys.path.append(str(ROOT_DIR)) from app.services.evaluator_service import EvaluatorService -from app.models.request_models import SynthesisRequest, EvaluationRequest, Export_synth, ModelParameters, CustomPromptRequest, JsonDataSize, RelativePath +from app.models.request_models import SynthesisRequest, EvaluationRequest, Export_synth, ModelParameters, CustomPromptRequest, JsonDataSize, RelativePath, Technique from app.services.synthesis_service import SynthesisService from app.services.export_results import Export_Service @@ -935,6 +935,149 @@ async def 
get_model_parameters() -> Dict: +@app.post("/complete_gen_prompt") +async def complete_prompt(request: SynthesisRequest): + """Allow users to see whole prompt which goes finally into LLM""" + try: + topic = request.topics[0] + batch_size = 5 if request.num_questions>=5 else request.num_questions + omit_questions = [] + + if request.technique == Technique.Freeform: + prompt = PromptBuilder.build_freeform_prompt( + model_id=request.model_id, + use_case=request.use_case, + topic=topic, + num_questions=batch_size, + omit_questions=omit_questions, + example_custom=request.example_custom or [], + example_path=request.example_path, + custom_prompt=request.custom_prompt, + schema=request.schema, + ) + elif request.technique == Technique.Custom_Workflow: + + inputs = [] + path = None # Initialize path + + try: + if not request.input_path: + raise ValueError("input_path must not be empty or None") + if not isinstance(request.input_path, (list, tuple)): + # Or handle a single string case if needed, e.g., path = request.input_path + raise TypeError("input_path must be a list or tuple of paths") + if not request.input_path[0]: + raise ValueError("First path in input_path is empty") + + path = request.input_path[0] + + except (ValueError, TypeError, IndexError) as e: # Catch specific errors for clarity + # Raise appropriate HTTP exception for bad request data + raise HTTPException(status_code=400, detail=f"Invalid input_path: {str(e)}") + except Exception as e: # Catch any other unexpected errors getting the path + raise HTTPException(status_code=500, detail=f"Unexpected error getting input path: {str(e)}") + + + # Proceed only if path was successfully retrieved + try: + with open(path) as f: + data = json.load(f) + + # Assuming data is a list of dicts + if not isinstance(data, list): + raise ValueError(f"Expected JSON data in {path} to be a list, but got {type(data).__name__}") + + inputs.extend(item.get(request.input_key, '') for item in data if isinstance(item, dict)) # Ensure item is a dict + + except FileNotFoundError: + raise HTTPException(status_code=404, detail=f"Input file not found: {path}") + except json.JSONDecodeError: + raise HTTPException(status_code=400, detail=f"Invalid JSON in file: {path}") + except ValueError as e: # For the list/dict structure check + raise HTTPException(status_code=400, detail=f"Invalid data structure in {path}: {str(e)}") + except Exception as e: # Catch any other unexpected file processing errors + raise HTTPException(status_code=500, detail=f"Error processing file {path}: {str(e)}") + + + # Check if inputs list is empty before accessing index 0 + if not inputs: + # Raise an error indicating no data was extracted based on the key + raise HTTPException(status_code=400, detail=f"No data extracted from {path} using key '{request.input_key}'. 
The file might be empty, the key might not exist, or the JSON structure is unexpected.") + + input_data = inputs[0] + + prompt = PromptBuilder.build_generate_result_prompt( + model_id=request.model_id, + use_case=request.use_case, + input=input_data, + examples=request.examples or [], + schema=request.schema, + custom_prompt=request.custom_prompt, + ) + elif request.technique == Technique.SFT: + prompt = PromptBuilder.build_prompt( + model_id=request.model_id, + use_case=request.use_case, + topic=topic, + num_questions=batch_size, + omit_questions=omit_questions, + examples=request.examples or [], + technique=request.technique, + schema=request.schema, + custom_prompt=request.custom_prompt, + ) + + return {"complete_prompt":prompt} + + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + +@app.post("/complete_eval_prompt") +async def complete_prompt(request: EvaluationRequest): + """Allow users to see whole prompt which goes finally into LLM""" + try: + + + if request.technique == Technique.Freeform: + with open(request.import_path, 'r') as file: + data = json.load(file) + + # Ensure data is a list of rows + rows = data if isinstance(data, list) else [data] + prompt = PromptBuilder.build_freeform_eval_prompt( + request.model_id, + request.use_case, + rows[0], + request.examples, + request.custom_prompt + ) + + elif request.technique == Technique.SFT or request.technique == Technique.Custom_Workflow: + + with open(request.import_path, 'r') as file: + data = json.load(file) + qa_pairs = [{ + request.output_key: item.get(request.output_key, ''), # Use get() with default value + request.output_value: item.get(request.output_value, '') # Use get() with default value + } for item in data] + qa_pair = qa_pairs[0] + prompt = PromptBuilder.build_eval_prompt( + request.model_id, + request.use_case, + qa_pair[request.output_key], + qa_pair[request.output_value], + request.examples, + request.custom_prompt + ) + + return {"complete_prompt":prompt} + + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + + + @app.get("/{use_case}/gen_prompt") async def customise_prompt(use_case: UseCase): diff --git a/app/models/request_models.py b/app/models/request_models.py index 951c060..e2980ee 100644 --- a/app/models/request_models.py +++ b/app/models/request_models.py @@ -195,6 +195,7 @@ class SynthesisResponse(BaseModel): class EvaluationRequest(BaseModel): """Request model for evaluating generated QA pairs""" use_case: UseCase + technique: Technique | None = Field(default=Technique.SFT) model_id: str import_path: Optional[str] = None import_type: str = "local" From 847cf88ecac6652117ac495cfe3d039a5e80853e Mon Sep 17 00:00:00 2001 From: Khauneesh-AI Date: Sun, 4 May 2025 06:27:37 +0530 Subject: [PATCH 07/10] CDP token for CAII inference --- .project-metadata.yaml | 6 +++- app/core/config.py | 67 ++++++++++++++++++++++++++++++-------- app/core/model_handlers.py | 4 ++- app/main.py | 38 +++++---------------- 4 files changed, 70 insertions(+), 45 deletions(-) diff --git a/.project-metadata.yaml b/.project-metadata.yaml index 85b9b1b..75b1abc 100644 --- a/.project-metadata.yaml +++ b/.project-metadata.yaml @@ -30,6 +30,10 @@ environment_variables: default: "your huggingface username" description: >- hf_username + CDP_TOKEN: + default: "API key for Cloudera AI Inference" + description: >- + CDP_TOKEN @@ -69,7 +73,7 @@ tasks: script: build/build_client.py arguments: None cpu: 2 - memory: 2 + memory: 4 short_summary: Create job to build client application 
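  # CDP_TOKEN (defined under environment_variables above) is the credential that
  # app/core/config.py's _get_caii_token() reads first; when it is unset, the
  # code falls back to the JWT at /tmp/jwt (see patch 07 below).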
environment: TASK_TYPE: CREATE/RUN_JOB diff --git a/app/core/config.py b/app/core/config.py index efd3405..498b6f5 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -5,6 +5,10 @@ import requests import json from fastapi.responses import JSONResponse +import os +from pathlib import Path +from dotenv import load_dotenv +load_dotenv() class UseCase(str, Enum): CODE_GENERATION = "code_generation" @@ -281,18 +285,55 @@ def get_examples_for_topic(use_case: UseCase, topic: str) -> List[Dict[str, str] } } +JWT_PATH = Path("/tmp/jwt") + +def _get_caii_token() -> str: + if (tok := os.getenv("CDP_TOKEN")): + return tok + try: + payload = json.loads(open(JWT_PATH).read()) + except FileNotFoundError: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="No CDP_TOKEN env‑var and no /tmp/jwt file") + except json.JSONDecodeError: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Malformed /tmp/jwt") + + if not (tok := payload.get("access_token")): + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="access_token missing in /tmp/jwt") + return tok + +def caii_check(endpoint: str, timeout: int = 3) -> requests.Response: + """ + Return the GET /models response if everything is healthy. + Raise HTTPException on *any* problem. + """ + if not endpoint: + raise HTTPException(400, "CAII endpoint not provided") + + token = _get_caii_token() + url = endpoint.removesuffix("/chat/completions") + "/models" + + try: + r = requests.get(url, + headers={"Authorization": f"Bearer {token}"}, + timeout=timeout) + except requests.exceptions.RequestException as exc: + raise HTTPException(503, f"CAII endpoint unreachable: {exc}") + + if r.status_code in (401, 403): + raise HTTPException(403, "Token is valid but has no access to this environment") + if r.status_code == 404: + raise HTTPException(404, "CAII endpoint or resource not found") + if 500 <= r.status_code < 600: + raise HTTPException(503, "CAII endpoint is downscaled; retry in ~15 min") + if r.status_code != 200: + raise HTTPException(r.status_code, r.text) + + return r -def caii_check(caii_endpoint): - API_KEY = json.load(open("/tmp/jwt"))["access_token"] - headers = { - "Authorization": f"Bearer {API_KEY}" - } - - - if caii_endpoint: - caii_endpoint = caii_endpoint.removesuffix('/chat/completions') - caii_endpoint = caii_endpoint + "/models" - response = requests.get(caii_endpoint, headers=headers, timeout=3) # Will raise RequestException if fails - - return response diff --git a/app/core/model_handlers.py b/app/core/model_handlers.py index 74ce723..64ae81f 100644 --- a/app/core/model_handlers.py +++ b/app/core/model_handlers.py @@ -11,6 +11,7 @@ from openai import OpenAI from app.core.exceptions import APIError, InvalidModelError, ModelHandlerError, JSONParsingError from app.core.telemetry_integration import track_llm_operation +from app.core.config import _get_caii_token @@ -280,7 +281,8 @@ def _handle_bedrock_request(self, prompt: str, retry_with_reduced_tokens: bool): def _handle_caii_request(self, prompt: str): """Original CAII implementation""" try: - API_KEY = json.load(open("/tmp/jwt"))["access_token"] + #API_KEY = json.load(open("/tmp/jwt"))["access_token"] + API_KEY = _get_caii_token() MODEL_ID = self.model_id caii_endpoint = self.caii_endpoint diff --git a/app/main.py b/app/main.py index f062f9c..165da51 100644 --- a/app/main.py +++ b/app/main.py @@ -408,15 +408,10 @@ async def generate_examples(request: SynthesisRequest): # Generate a request ID request_id 
= str(uuid.uuid4()) - if request.inference_type== "CAII": + if request.inference_type == "CAII": caii_endpoint = request.caii_endpoint - response = caii_check(caii_endpoint) - message = "The CAII endpoint you are tring to reach is downscaled, please try after >15 minutes while it autoscales, meanwhile please try another model" - if response.status_code != 200: - return JSONResponse( - status_code=503, # Service Unavailable - content={"status": "failed", "error": message} - ) + + caii_check(caii_endpoint) is_demo = request.is_demo @@ -464,13 +459,7 @@ async def generate_freeform_data(request: SynthesisRequest): if request.inference_type == "CAII": caii_endpoint = request.caii_endpoint - response = caii_check(caii_endpoint) - message = "The CAII endpoint you are trying to reach is downscaled, please try after >15 minutes while it autoscales, meanwhile please try another model" - if response.status_code != 200: - return JSONResponse( - status_code=503, # Service Unavailable - content={"status": "failed", "error": message} - ) + caii_check(caii_endpoint) is_demo = request.is_demo mem = 4 @@ -514,15 +503,9 @@ async def evaluate_examples(request: EvaluationRequest): """Evaluate generated QA pairs""" request_id = str(uuid.uuid4()) - if request.inference_type== "CAII": + if request.inference_type == "CAII": caii_endpoint = request.caii_endpoint - response = caii_check(caii_endpoint) - message = "The CAII endpoint you are tring to reach is downscaled, please try after >15 minutes while it autoscales, meanwhile please try another model" - if response.status_code != 200: - return JSONResponse( - status_code=503, # Service Unavailable - content={"status": "failed", "error": message} - ) + caii_check(caii_endpoint) is_demo = request.is_demo if is_demo: @@ -541,13 +524,8 @@ async def evaluate_freeform(request: EvaluationRequest): if request.inference_type == "CAII": caii_endpoint = request.caii_endpoint - response = caii_check(caii_endpoint) - message = "The CAII endpoint you are trying to reach is downscaled, please try after >15 minutes while it autoscales, meanwhile please try another model" - if response.status_code != 200: - return JSONResponse( - status_code=503, # Service Unavailable - content={"status": "failed", "error": message} - ) + caii_check(caii_endpoint) + is_demo = getattr(request, 'is_demo', True) if is_demo: From 915f553e85b3c9be2db914ce2a1f069a818a9af4 Mon Sep 17 00:00:00 2001 From: Khauneesh-AI Date: Mon, 5 May 2025 23:41:38 +0530 Subject: [PATCH 08/10] analyzer for prompt assist --- app/core/config.py | 89 +++++++ app/core/prompt_templates.py | 369 ++++++++++++++++++++++++++---- app/main.py | 52 ++++- app/models/request_models.py | 1 + app/services/synthesis_service.py | 43 ++-- pyproject.toml | 10 +- uv.lock | 198 ++++++++++++++++ 7 files changed, 692 insertions(+), 70 deletions(-) diff --git a/app/core/config.py b/app/core/config.py index 498b6f5..ea8d6f1 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -336,4 +336,93 @@ def caii_check(endpoint: str, timeout: int = 3) -> requests.Response: return r +LENDING_DATA_PROMPT = """ + Create profile data for the LendingClub company which specialises in lending various types of loans to urban customers. + + Background: + LendingClub is a peer-to-peer lending platform connecting borrowers with investors. The dataset captures loan applications, + borrower profiles, and outcomes to assess credit risk, predict defaults, and determine interest rates. 
+ + + Loan Record field: + + Each generated record must include the following fields in the exact order provided, with values generated as specified: + + - loan_amnt: The listed amount of the loan applied for by the borrower. If at some point in time, the credit department + reduces the loan amount, then it will be reflected in this value. + - term: The number of payments on the loan. Values are in months and can be either " 36 months" or " 60 months". + - int_rate: Interest Rate on the loan + - installment: The monthly payment owed by the borrower if the loan originates. + - grade: LC assigned loan grade (Possible values: A, B, C, D, E, F, G) + - sub_grade: LC assigned loan subgrade (Possible sub-values: 1-5 i.e. A5) + - emp_title: The job title supplied by the Borrower when applying for the loan. + - emp_length: Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 + means ten or more years. + - home_ownership: The home ownership status provided by the borrower during registration or obtained from the credit report. + Possible values are: RENT, OWN, MORTGAGE, ANY, OTHER + - annual_inc: The self-reported annual income provided by the borrower during registration. + - verification_status: Indicates if income was verified by LC, not verified, or if the income source was verified + - issue_d: The month which the loan was funded + - loan_status: Current status of the loan (Possible values: "Fully Paid", "Charged Off") + - purpose: A category provided by the borrower for the loan request. + - title: The loan title provided by the borrower + - dti: A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage + and the requested LC loan, divided by the borrower’s self-reported monthly income. + - earliest_cr_line: The month the borrower's earliest reported credit line was opened + - open_acc: The number of open credit lines in the borrower's credit file. + - pub_rec: Number of derogatory public records + - revol_bal: Total credit revolving balance + - revol_util: Revolving line utilization rate, or the amount of credit the borrower is using relative to all available + revolving credit. + - total_acc: The total number of credit lines currently in the borrower's credit file + - initial_list_status: The initial listing status of the loan. Possible values are: w, f + - application_type: Indicates whether the loan is an individual application or a joint application with two co-borrowers + - mort_acc: Number of mortgage accounts. + - pub_rec_bankruptcies: Number of public record bankruptcies + - address: The physical address of the person + + In addition to the definitions above, when generating samples, adhere to following guidelines: + + Privacy Compliance guidelines: + 1) Ensure PII from examples such as addresses are not used in the generated data to minimize any privacy concerns. + 2) Avoid real PII in addresses. Use generic street names and cities. + + Formatting guidelines: + 1) Use consistent decimal precision (e.g., "10000.00" for loan_amnt). + 2) Dates (e.g. issue_d, earliest_cr_line) should follow the "Jan-YYYY" format. + 3) term has a leading space before the number of months (i.e. " 36 months") + 4) The address field is a special case where the State zipcode needs to be exactly as specified in the seed instructions. + The persons address must follow the format as specified in the examples with the State zipcode coming last. 
+ 5) Any other formatting guidelines that can be inferred from the examples or field definitions but are not listed above. + + Cross-row guidelines: + 1) Generated data should maintain consistency with all statistical parameters and distributions defined in the seed instruction + across records (e.g., 60% of `term` as " 36 months"). + + Cross-column guidelines: + 1) Ensure logical and realistic consistency and correlations between variables. Examples include but not limited to: + a) Grade/Sub-grade consistency: Sub-grade must match the grade (e.g., "B" grade → "B1" to "B5"). + b) Interest Rate vs Grade/Subgrade relationship: Higher subgrades (e.g., A5) could have higher `int_rate` than lower subgrades (e.g., A3). + c) Mortgage Consistency: `mort_acc` should be 1 or more if `home_ownership` is `MORTGAGE`. + d) Open Accounts: `open_acc` ≤ `total_acc`. + + Data distribution guidelines: + 1) Continuous Variables (e.g., `loan_amnt`, `annual_inc`): Adhere to the mean and standard deviation given in the seed + instructions for each variable. + 2) Categorical variables (e.g., `term`, `home_ownership`): Use probability distributions given in the seed instructions + (e.g. 60% for " 36 months", 40% for " 60 months"). + 3) Discrete Variables (e.g., `pub_rec`, `mort_acc`): Adhere to value ranges and statistical parameters + provided in the seed instructions. + 4) Any other logical data distribution guidelines that can be inferred from the seed instructions or field definitions + and are not specified above. + + Background knowledge and realism guidelines: + 1) Ensure fields such as interest rates reflect real-world interest rates at the time the loan is issued. + 2) Generate values that are plausible (e.g., `annual_inc` ≤ $500,000 for most `emp_length` ranges). + 3) Avoid unrealistic values (e.g., `revol_util` as "200%" is unrealistic). + 4) Ensure that the generated data is realistic and plausible, avoiding extreme or impossible values. + 5) Ensure that the generated data is diverse and not repetitive, avoiding identical or very similar records. + 6) Ensure that the generated data is coherent and consistent, avoiding contradictions or inconsistencies between fields. + 7) Ensure that the generated data is relevant to the LendingClub use case and adheres to the guidelines provided.""" + diff --git a/app/core/prompt_templates.py b/app/core/prompt_templates.py index ad838ee..9785636 100644 --- a/app/core/prompt_templates.py +++ b/app/core/prompt_templates.py @@ -2,8 +2,13 @@ import json import csv import os +import pandas as pd +import numpy as np from app.models.request_models import Example, Example_eval -from app.core.config import UseCase, Technique, ModelFamily, get_model_family,USE_CASE_CONFIGS +from app.core.config import UseCase, Technique, ModelFamily, get_model_family,USE_CASE_CONFIGS, LENDING_DATA_PROMPT +from app.core.data_loader import DataLoader +from app.core.data_analyser import DataAnalyser +from app.core.summary_formatter import SummaryFormatter DEFAULT_SCHEMA = """CREATE TABLE employees ( id INT PRIMARY KEY, @@ -580,52 +585,303 @@ def get_freeform_eval_prompt(model_id: str, return final_prompt - @staticmethod - def create_custom_prompt(model_id: str, - custom_prompt:str - ) -> str: + # @staticmethod + # def create_custom_prompt(model_id: str, + # custom_prompt:str, + # example_path: Optional[str], + # ) -> str: - final_instruction = f"""You are a brilliant prompt engineer. 
Your job is to create a best prompt for provided task: {custom_prompt} which can get - best response from large language model - The prompt should Focus on: + # final_instruction = f"""You are a brilliant prompt engineer. Your job is to create a best prompt for provided task: {custom_prompt} which can get + # best response from large language model + # The prompt should Focus on: + + # - The core task objective + # - Key aspects to consider or maintain + # - Any special requirements specific to the task. + # For example the prompt for code generation is below + # {DEFAULT_CODE_GENERATION_PROMPT} + # Make sure you just give the prompt in your response which can be directly used by large language model. + # No need to give any explanation but just the prompt in same format as the example given above. + # """ + # model_family = get_model_family(model_id) + + # if model_family== ModelFamily.LLAMA: + + # final_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>" + '\n' + final_instruction + '\n' + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" - - The core task objective - - Key aspects to consider or maintain - - Any special requirements specific to the task. - for example the prompt for code generation is below - {DEFAULT_CODE_GENERATION_PROMPT} - Make sure you just give the prompt in your response which can be directly used by large language model. - No need to give any explanation but just the prompt in same format as the example given above. - """ - model_family = get_model_family(model_id) + # elif model_family== ModelFamily.MISTRAL: + + # final_prompt = '[INST]' + "\n" + final_instruction + '\n' + '[/INST]' + + # elif model_family == ModelFamily.CLAUDE: + # final_prompt = "\n" + final_instruction + + # elif model_family== ModelFamily.QWEN: + # system_prompt = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant." + + # final_prompt = f'''<|im_start|>system + # {system_prompt}<|im_end|> + # <|im_start|>user + + # {final_instruction}<|im_end|> + # <|im_start|>assistant + # ''' + + + # else: + # final_prompt = "\n" + final_instruction + # return final_prompt + + def create_custom_prompt( + model_id: str, + custom_prompt: str, + example_path: str | None = None, +) -> str: + """ + Create a custom prompt for a language model, optionally including dataset analysis. 
- if model_family== ModelFamily.LLAMA: + Args: + model_id: The ID of the model to create the prompt for + custom_prompt: The base custom prompt text + example_path: Optional path to an example dataset - final_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>" + '\n' + final_instruction + '\n' + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>" + Returns: + A formatted prompt suitable for the specified model + """ + summary_block = "" + example_block = "" - elif model_family== ModelFamily.MISTRAL: + if example_path: + print(f"Loading example data from: {example_path}") + try: + df = DataLoader.load(example_path) + #print(f"Loaded DataFrame with shape: {df.shape}") + + # Apply type inference to improve analysis + df = DataLoader.infer_dtypes(df) + + if "error_message" in df.columns and len(df.columns) == 1: + # Data loading failed + print(f"Error loading data: {df['error_message'][0]}") + # Keep summary and example blocks as empty strings + elif not df.empty: + # ---------- build summary block ---------- + try: + print("Analyzing data...") + summary_dict = DataAnalyser.analyse(df) + + # Create a more structured summary with explanations + summary_block = ( + "\n" + "INSTRUCTIONS: The following analysis provides key insights about the dataset that should guide your synthetic data generation. Use these signals to match distributions and relationships when generating synthetic data.\n\n" + ) + + # Add section for columns classification + if "columns" in summary_dict: + summary_block += (""" + ## Column Types\n + "These are all columns identified in the dataset in given specific order:\n\n + Make sure to provide definitions of each column in the same order as they are in the dataset. + Don't change or skip any column name or order. 
+ """) + + + + + summary_block += "\n".join(f"- {col}" for col in summary_dict["columns"]) + "\n\n" + + # Add section for statistical analysis + if "statistical_analysis" in summary_dict: + summary_block += ( + "## Statistical Analysis\n" + "These statistics describe the distributions of values in the dataset:\n\n" + ) + + if "numeric" in summary_dict["statistical_analysis"]: + summary_block += ( + "### Numeric Statistics\n" + "Key statistics for numeric columns (mean, median, min, max, etc.):\n" + f"{json.dumps(summary_dict['statistical_analysis']['numeric'], indent=2)}\n\n" + ) + + if "categorical" in summary_dict["statistical_analysis"]: + summary_block += ( + "### Categorical Statistics\n" + "Distribution of values in categorical columns:\n" + f"{json.dumps(summary_dict['statistical_analysis']['categorical'], indent=2)}\n\n" + ) + + if "datetime" in summary_dict["statistical_analysis"]: + summary_block += ( + "### DateTime Statistics\n" + "Temporal patterns and ranges in datetime columns:\n" + f"{json.dumps(summary_dict['statistical_analysis']['datetime'], indent=2)}\n\n" + ) + + # Add section for cross-row relationships + if "cross_row_relationship" in summary_dict: + summary_block += ( + "## Cross-Row Relationships\n" + "These insights describe patterns across rows in the dataset:\n\n" + f"{json.dumps(summary_dict['cross_row_relationship'], indent=2)}\n\n" + ) + + # Add section for cross-column relationships + if "cross_column_relationship" in summary_dict: + summary_block += ( + "## Cross-Column Relationships\n" + "These insights describe correlations and dependencies between columns:\n\n" + f"{json.dumps(summary_dict['cross_column_relationship'], indent=2)}\n\n" + ) + + # Close the data summary block + summary_block += "\n" + + print("Data analysis completed successfully.") + + except Exception as e: + # Analysis failed → keep summary_block as empty string + print(f"Error in data analysis: {str(e)}") + # Do NOT add any error messages to blocks + + # ---------- build example block ---------- + try: + print("Creating CSV snippet...") + csv_snippet = SummaryFormatter.first_rows_block(df) + example_block = ( + "\n" + "INSTRUCTIONS: The CSV snippet shows the first 10 rows of the " + "original dataset. Preserve this column order, header names, " + "and data types while creating new rows. " + "Use this to create a comprehensive list of all columns and their definitions. " + "Make sure the list covers all details and columns which will be required " + "to create data.\n" + f"{csv_snippet}" + "\n" + ) + print("CSV snippet created successfully.") + except Exception as e: + # Snippet failed → keep example_block as empty string + print(f"Error creating CSV snippet: {str(e)}") + # Do NOT add any error messages to blocks + except Exception as e: + print(f"Error processing example file: {str(e)}") + # Keep blocks as empty strings + # Do NOT add any error messages to blocks + + # Construct the final instruction with proper error handling for missing constants + try: - final_prompt = '[INST]' + "\n" + final_instruction + '\n' + '[/INST]' + + if example_path: + #Construct the final instruction + final_instruction = f"""You are a brilliant prompt engineer. + Your task: **{custom_prompt}** - elif model_family == ModelFamily.CLAUDE: - final_prompt = "\n" + final_instruction + {summary_block}{example_block}Return **only** the finished prompt that can be sent directly to a language model. + Now that you have complete information about the task, follow the below instructions to create prompt. 
- elif model_family== ModelFamily.QWEN: - system_prompt = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant." + - Look at column list and include all columns in your prompt with their definitions. the list should be exhaustive and cover all columns. + - Make sure to have all statistical analysis , cross-row and cross-column relationships in your prompt. + - The prmpt should be absolutely clear in its final goal and there should not be any ambiguity or vagueness in the prompt. + - The prompt should be clear and exhaustive in its column details. + + + A few examples are given below for your reference + Code Generation: + + {DEFAULT_CODE_GENERATION_PROMPT} + + Lending Data Generation: + {LENDING_DATA_PROMPT} + + Make sure you just give the prompt in your response which can be directly used by large language model. + No need to give any explanation but just the prompt in same format as the example given above. + Never mention how many rows or dataset size needs to be generated in the final output. + + """ + else: + + final_instruction = f"""You are a brilliant prompt engineer. + Your task: **{custom_prompt}** + + {summary_block}{example_block} + + Return a well-crafted prompt that focuses on: + - The core task objective + - Clear and exhaustive column details + - Key aspects to consider or maintain + - Special requirements for the task + + A few examples are given below for your reference + Code Generation: + + {DEFAULT_CODE_GENERATION_PROMPT} + + Text to SQL: + {DEFAULT_TEXT2SQL_PROMPT} + + Make sure you just give the prompt in your response which can be directly used by large language model. + No need to give any explanation but just the prompt in same format as the example given above. + Never mention how many rows or dataset size needs to be generated in the final output. + """ + except Exception as e: + print(f"Error constructing instruction template: {str(e)}") + # Fallback to a simpler template that still includes any successful blocks + final_instruction = f"""You are a brilliant prompt engineer. + Your task: **{custom_prompt}** + + {summary_block}{example_block} + + Return a well-crafted prompt that focuses on: + - The core task objective + - Clear and exhaustive column details + - Key aspects to consider or maintain + - Special requirements for the task + + A few examples are given below for your reference + Code Generation: + + {DEFAULT_CODE_GENERATION_PROMPT} + + Text to SQL: + {DEFAULT_TEXT2SQL_PROMPT} + + Make sure you just give the prompt in your response which can be directly used by large language model. + No need to give any explanation but just the prompt in same format as the example given above. + Never mention how many rows or dataset size needs to be generated in the final output. + """ + + # Format according to model family + try: + family = get_model_family(model_id) - final_prompt = f'''<|im_start|>system - {system_prompt}<|im_end|> + if family == ModelFamily.LLAMA: + return "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n" \ + f"{final_instruction}\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>" + elif family == ModelFamily.MISTRAL: + return f"[INST]\n{final_instruction}\n[/INST]" + elif family == ModelFamily.CLAUDE: + return "\n" + final_instruction + elif family == ModelFamily.QWEN: + system = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant." 
+ return f"""<|im_start|>system
+ {system}<|im_end|>
<|im_start|>user
-
+ {final_instruction}<|im_end|>
<|im_start|>assistant
- '''
-
-
- else:
- final_prompt = "\n" + final_instruction
- return final_prompt
+ """
+ else:
+ # Default format if model family is unknown
+ return "\n" + final_instruction
+ except Exception as e:
+ print(f"Error formatting for model family: {str(e)}")
+ # Return the raw instruction if formatting fails
+ return final_instruction
+
+
@staticmethod
def generate_result_prompt(model_id: str,
@@ -715,20 +971,32 @@ def get_freeform_prompt(model_id: str,
) -> str:
if example_path:
- file_extension = os.path.splitext(example_path)[1].lower()
-
- with open(example_path, 'r') as f:
- if file_extension == '.json':
- # Handle JSON files
- example_upload = json.load(f)
- examples_str = json.dumps(example_upload, indent=2)
- elif file_extension == '.csv':
- # Handle CSV files
- csv_reader = csv.DictReader(f)
- example_upload = list(csv_reader)
- examples_str = json.dumps(example_upload, indent=2) # Convert CSV data to JSON string format
- else:
- raise ValueError(f"Unsupported file extension: {file_extension}. Only .json and .csv are supported.")
+ try:
+ # Use DataLoader to load the file, limiting to 10 rows
+ df = DataLoader.load(example_path, sample_rows=10)
+
+ # Convert DataFrame to list of dictionaries
+ example_upload = df.head(10).to_dict(orient='records')
+
+ # Handle non-serializable objects
+ def json_serializable(obj):
+ if isinstance(obj, (pd.Timestamp, np.datetime64)):
+ return obj.isoformat()
+ elif isinstance(obj, np.integer):
+ return int(obj)
+ elif isinstance(obj, np.floating):
+ return float(obj)
+ elif isinstance(obj, np.ndarray):
+ return obj.tolist()
+ else:
+ return str(obj)
+
+ # Convert to JSON string with custom serialization
+ examples_str = json.dumps(example_upload, indent=2, default=json_serializable)
+
+ except Exception as e:
+ print(f"Error processing example file: {str(e)}")
+ examples_str = ""
else:
if example_custom:
@@ -876,10 +1144,11 @@ def build_generate_result_prompt(model_id: str,
@staticmethod
def build_custom_prompt(model_id: str,
- custom_prompt = Optional[str]
+ custom_prompt: Optional[str] = None,
+ example_path: Optional[str] = None
) -> str:
- return ModelPrompts.create_custom_prompt(model_id, custom_prompt)
+ return ModelPrompts.create_custom_prompt(model_id, custom_prompt, example_path)
@staticmethod
def build_freeform_prompt(model_id: str,
diff --git a/app/main.py b/app/main.py
index 165da51..216eb8d 100644
--- a/app/main.py
+++ b/app/main.py
@@ -1,6 +1,7 @@
import os
import boto3
from datetime import datetime, timezone
+from typing import Any, Dict
from botocore.config import Config
from fastapi import FastAPI, HTTPException, Request, status
from fastapi.responses import JSONResponse
@@ -9,6 +10,9 @@
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from pydantic import BaseModel
+import pandas as pd
+import numpy as np
+import math
from typing import Dict, List, Optional
import subprocess
import asyncio
@@ -22,6 +25,7 @@
import sys
import json
import uuid
+from fastapi.encoders import jsonable_encoder
print(os.getcwd())
# Setup absolute paths
ROOT_DIR = Path(__file__).parent.parent # Goes up one level from app/main.py to reach project root
@@ -180,7 +184,34 @@ def restart_application():
print(f"Error restarting application: {e}")
raise
-
+def deep_sanitize_nans(obj: Any) -> Any:
+ """
+ Recursively traverse all data structures and replace NaN with None.
+ This handles all nested structures.
+ """
+ if isinstance(obj, dict):
+ return {k: deep_sanitize_nans(v) for k, v in obj.items()}
+ elif isinstance(obj, list):
+ return [deep_sanitize_nans(item) for item in obj]
+ elif isinstance(obj, tuple):
+ return tuple(deep_sanitize_nans(item) for item in obj)
+ elif isinstance(obj, set):
+ return {deep_sanitize_nans(item) for item in obj}
+ elif isinstance(obj, float) and (math.isnan(obj) or math.isinf(obj)):
+ return None
+ elif isinstance(obj, (pd.Timestamp, np.datetime64)):
+ return obj.isoformat()
+ elif isinstance(obj, np.integer):
+ return int(obj)
+ elif isinstance(obj, np.floating):
+ if np.isnan(obj) or np.isinf(obj):
+ return None
+ return float(obj)
+ elif isinstance(obj, np.ndarray):
+ return deep_sanitize_nans(obj.tolist())
+ elif pd.isna(obj):
+ return None
+ return obj
# Add these models
@@ -276,7 +307,9 @@ def get_timeout_for_request(request: Request) -> float:
if path.endswith("/generate"):
return 200.0 # 2 minutes for generation
elif path.endswith("/freeform"):
- return 200.0 # 2 minutes for generation
+ return 300.0 # 5 minutes for generation
+ elif path.endswith("/create_custom_prompt"):
+ return 300.0 # 5 minutes for prompt creation
elif path.endswith("/evaluate"):
return 200.0 # 2 minutes for evaluation
elif path.endswith("/export_results"):
@@ -284,7 +317,7 @@
elif "health" in path:
return 5.0 # Quick timeout for health checks
elif path.endswith("/upgrade"):
- return 1200 # timeout increase for upgrade
+ return 2000 # timeout increase for upgrade
else:
return 60.0 # Default timeout
@@ -486,13 +519,21 @@ async def generate_freeform_data(request: SynthesisRequest):
core = 2
if is_demo:
- return await synthesis_service.generate_freeform(request, is_demo=is_demo, request_id=request_id )
+ result = await synthesis_service.generate_freeform(request, is_demo=is_demo, request_id=request_id )
+ # Apply our deep sanitization to handle all NaN values
+ sanitized_result = deep_sanitize_nans(result)
+
+ # Then use jsonable_encoder for FastAPI-specific conversions
+ final_result = jsonable_encoder(sanitized_result)
+
+ return final_result
else:
# Pass additional parameter to indicate this is a freeform request
request_dict = request.model_dump()
freeform = True
# Convert back to SynthesisRequest object
freeform_request = SynthesisRequest(**request_dict)
+
return synthesis_job.generate_job(freeform_request, core, mem, request_id=request_id, freeform = freeform)
@app.post("/synthesis/evaluate",
@@ -605,8 +646,9 @@ async def create_custom_prompt(request: CustomPromptRequest, request_id = None):
prompt = PromptBuilder.build_custom_prompt(
model_id=request.model_id,
custom_prompt=request.custom_prompt,
+ example_path=request.example_path
)
- #print(prompt)
+ print(prompt)
prompt_gen = model_handler.generate_response(prompt, request_id=request_id)
return {"generated_prompt":prompt_gen}
diff --git a/app/models/request_models.py b/app/models/request_models.py
index e2980ee..bf47e5d 100644
--- a/app/models/request_models.py
+++ b/app/models/request_models.py
@@ -240,6 +240,7 @@ class CustomPromptRequest(BaseModel):
inference_type :Optional[str] = "aws_bedrock"
caii_endpoint: Optional[str] = None
+ example_path: Optional[str] = None
custom_p:bool =True
model_config = ConfigDict(protected_namespaces=(),
diff --git a/app/services/synthesis_service.py b/app/services/synthesis_service.py
index 30a630a..5edf9d7 100644
--- a/app/services/synthesis_service.py
+++ b/app/services/synthesis_service.py
@@ -14,6 +14,9 @@
import asyncio
from
fastapi import FastAPI, BackgroundTasks, HTTPException from app.core.exceptions import APIError, InvalidModelError, ModelHandlerError, JSONParsingError +from app.core.data_loader import DataLoader +import pandas as pd +import numpy as np from app.models.request_models import SynthesisRequest, Example, ModelParameters from app.core.model_handlers import create_handler @@ -1017,20 +1020,32 @@ async def generate_freeform(self, request: SynthesisRequest, job_name=None, is_d # For examples if request.example_path: - file_extension = os.path.splitext(request.example_path)[1].lower() - - with open(request.example_path, 'r') as f: - if file_extension == '.json': - # Handle JSON files - example_upload = json.load(f) - examples_str = json.dumps(example_upload, indent=2) - elif file_extension == '.csv': - # Handle CSV files - csv_reader = csv.DictReader(f) - example_upload = list(csv_reader) - examples_str = json.dumps(example_upload, indent=2) # Convert CSV data to JSON string format - else: - raise ValueError(f"Unsupported file extension: {file_extension}. Only .json and .csv are supported.") + try: + # Use DataLoader to load the file, limiting to 10 rows + df = DataLoader.load(request.example_path, sample_rows=10) + + # Convert DataFrame to list of dictionaries + example_upload = df.head(10).to_dict(orient='records') + + # Handle non-serializable objects + def json_serializable(obj): + if isinstance(obj, (pd.Timestamp, np.datetime64)): + return obj.isoformat() + elif isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + else: + return str(obj) + + # Convert to JSON string with custom serialization + examples_str = json.dumps(example_upload, indent=2, default=json_serializable) + + except Exception as e: + print(f"Error processing example file: {str(e)}") + examples_str = "" else: examples_value = request.example_custom if hasattr(request, 'example_custom') else None examples_str = self.safe_json_dumps(examples_value) diff --git a/pyproject.toml b/pyproject.toml index 965dbe1..864e889 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ version = "0.1.0" description = "Synthetic Data Studio Project" requires-python = ">=3.10" dependencies = [ + # ── existing ───────────────────────────────────────────── "fastapi==0.109.2", "uvicorn==0.27.1", "pydantic==2.10.4", @@ -31,8 +32,15 @@ dependencies = [ "pytest-asyncio==0.25.3", "pytest-cov==6.0.0", "psutil==5.9.8", + "pandas>=2.2.3", + # ── new packages for data‑analysis layer ───────────────── + "numpy>=1.24.0", # explicit, for dcor/scipy (already a transitive dep of pandas) + "scipy>=1.12.0", # chi‑square, ANOVA, etc. 
+ "dcor>=0.6", # distance‑correlation metric + "openpyxl>=3.1.2", # read .xlsx files + "pyxlsb>=1.0.9", # read .xlsb files ] [tool.hatch.build.targets.wheel] -packages = ["app"] \ No newline at end of file +packages = ["app"] diff --git a/uv.lock b/uv.lock index a50b04f..dded261 100644 --- a/uv.lock +++ b/uv.lock @@ -134,12 +134,17 @@ dependencies = [ { name = "boto3" }, { name = "botocore" }, { name = "datasets" }, + { name = "dcor" }, { name = "fastapi" }, { name = "httpx" }, { name = "huggingface-hub" }, { name = "loguru" }, { name = "nest-asyncio" }, + { name = "numpy" }, { name = "openai" }, + { name = "openpyxl" }, + { name = "pandas" }, + { name = "psutil" }, { name = "pydantic" }, { name = "pyflakes" }, { name = "pymupdf" }, @@ -148,6 +153,8 @@ dependencies = [ { name = "pytest-cov" }, { name = "python-docx" }, { name = "python-dotenv" }, + { name = "pyxlsb" }, + { name = "scipy" }, { name = "sqlalchemy" }, { name = "typing-extensions" }, { name = "uvicorn" }, @@ -160,12 +167,17 @@ requires-dist = [ { name = "boto3", specifier = "==1.35.48" }, { name = "botocore", specifier = "==1.35.48" }, { name = "datasets", specifier = "==2.20.0" }, + { name = "dcor", specifier = ">=0.6" }, { name = "fastapi", specifier = "==0.109.2" }, { name = "httpx", specifier = "==0.27.2" }, { name = "huggingface-hub", specifier = "==0.23.5" }, { name = "loguru", specifier = "==0.7.2" }, { name = "nest-asyncio", specifier = "==1.5.8" }, + { name = "numpy", specifier = ">=1.24.0" }, { name = "openai", specifier = "==1.57.2" }, + { name = "openpyxl", specifier = ">=3.1.2" }, + { name = "pandas", specifier = ">=2.2.3" }, + { name = "psutil", specifier = "==5.9.8" }, { name = "pydantic", specifier = "==2.10.4" }, { name = "pyflakes", specifier = "==3.2.0" }, { name = "pymupdf", specifier = "==1.25.1" }, @@ -174,6 +186,8 @@ requires-dist = [ { name = "pytest-cov", specifier = "==6.0.0" }, { name = "python-docx", specifier = "==1.1.2" }, { name = "python-dotenv", specifier = "==1.0.0" }, + { name = "pyxlsb", specifier = ">=1.0.9" }, + { name = "scipy", specifier = ">=1.12.0" }, { name = "sqlalchemy", specifier = "==2.0.38" }, { name = "typing-extensions", specifier = "==4.12.2" }, { name = "uvicorn", specifier = "==0.27.1" }, @@ -448,6 +462,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/60/2d/963b266bb8f88492d5ab4232d74292af8beb5b6fdae97902df9e284d4c32/datasets-2.20.0-py3-none-any.whl", hash = "sha256:76ac02e3bdfff824492e20678f0b6b1b6d080515957fe834b00c2ba8d6b18e5e", size = 547777 }, ] +[[package]] +name = "dcor" +version = "0.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "joblib" }, + { name = "numba" }, + { name = "numpy" }, + { name = "scipy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/00/a7/1d06e98f1b123be60ba5de004edba510025da689c8cfb501299a8f2ba1d1/dcor-0.6.tar.gz", hash = "sha256:f5d39776101db4787348e6be6cd9369341efeb40b070509a30d5c57185558431", size = 45509 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/45/f3/49770c523067d2179a600f236ea6d55f0a02909a424d055dbc50e04c4860/dcor-0.6-py3-none-any.whl", hash = "sha256:de306fc666668188749730fc803fc1d4d804d9886c92b622ba57b434fed395a2", size = 55545 }, +] + [[package]] name = "dill" version = "0.3.8" @@ -475,6 +504,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/44/8a15e45ffa96e6cf82956dd8d7af9e666357e16b0d93b253903475ee947f/docutils-0.16-py2.py3-none-any.whl", hash = "sha256:0c5b78adfbf7762415433f5515cd5c9e762339e23369dbe8000d84a4bf4ab3af", size = 
548181 }, ] +[[package]] +name = "et-xmlfile" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059 }, +] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -783,6 +821,43 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/31/b4/b9b800c45527aadd64d5b442f9b932b00648617eb5d63d2c7a6587b7cafc/jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980", size = 20256 }, ] +[[package]] +name = "joblib" +version = "1.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/30/08/8bd4a0250247861420a040b33ccf42f43c426ac91d99405374ef117e5872/joblib-1.5.0.tar.gz", hash = "sha256:d8757f955389a3dd7a23152e43bc297c2e0c2d3060056dad0feefc88a06939b5", size = 330234 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/d3/13ee227a148af1c693654932b8b0b02ed64af5e1f7406d56b088b57574cd/joblib-1.5.0-py3-none-any.whl", hash = "sha256:206144b320246485b712fc8cc51f017de58225fa8b414a1fe1764a7231aca491", size = 307682 }, +] + +[[package]] +name = "llvmlite" +version = "0.44.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/89/6a/95a3d3610d5c75293d5dbbb2a76480d5d4eeba641557b69fe90af6c5b84e/llvmlite-0.44.0.tar.gz", hash = "sha256:07667d66a5d150abed9157ab6c0b9393c9356f229784a4385c02f99e94fc94d4", size = 171880 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/41/75/d4863ddfd8ab5f6e70f4504cf8cc37f4e986ec6910f4ef8502bb7d3c1c71/llvmlite-0.44.0-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:9fbadbfba8422123bab5535b293da1cf72f9f478a65645ecd73e781f962ca614", size = 28132306 }, + { url = "https://files.pythonhosted.org/packages/37/d9/6e8943e1515d2f1003e8278819ec03e4e653e2eeb71e4d00de6cfe59424e/llvmlite-0.44.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cccf8eb28f24840f2689fb1a45f9c0f7e582dd24e088dcf96e424834af11f791", size = 26201096 }, + { url = "https://files.pythonhosted.org/packages/aa/46/8ffbc114def88cc698906bf5acab54ca9fdf9214fe04aed0e71731fb3688/llvmlite-0.44.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7202b678cdf904823c764ee0fe2dfe38a76981f4c1e51715b4cb5abb6cf1d9e8", size = 42361859 }, + { url = "https://files.pythonhosted.org/packages/30/1c/9366b29ab050a726af13ebaae8d0dff00c3c58562261c79c635ad4f5eb71/llvmlite-0.44.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:40526fb5e313d7b96bda4cbb2c85cd5374e04d80732dd36a282d72a560bb6408", size = 41184199 }, + { url = "https://files.pythonhosted.org/packages/69/07/35e7c594b021ecb1938540f5bce543ddd8713cff97f71d81f021221edc1b/llvmlite-0.44.0-cp310-cp310-win_amd64.whl", hash = "sha256:41e3839150db4330e1b2716c0be3b5c4672525b4c9005e17c7597f835f351ce2", size = 30332381 }, + { url = "https://files.pythonhosted.org/packages/b5/e2/86b245397052386595ad726f9742e5223d7aea999b18c518a50e96c3aca4/llvmlite-0.44.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = 
"sha256:eed7d5f29136bda63b6d7804c279e2b72e08c952b7c5df61f45db408e0ee52f3", size = 28132305 }, + { url = "https://files.pythonhosted.org/packages/ff/ec/506902dc6870249fbe2466d9cf66d531265d0f3a1157213c8f986250c033/llvmlite-0.44.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ace564d9fa44bb91eb6e6d8e7754977783c68e90a471ea7ce913bff30bd62427", size = 26201090 }, + { url = "https://files.pythonhosted.org/packages/99/fe/d030f1849ebb1f394bb3f7adad5e729b634fb100515594aca25c354ffc62/llvmlite-0.44.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5d22c3bfc842668168a786af4205ec8e3ad29fb1bc03fd11fd48460d0df64c1", size = 42361858 }, + { url = "https://files.pythonhosted.org/packages/d7/7a/ce6174664b9077fc673d172e4c888cb0b128e707e306bc33fff8c2035f0d/llvmlite-0.44.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f01a394e9c9b7b1d4e63c327b096d10f6f0ed149ef53d38a09b3749dcf8c9610", size = 41184200 }, + { url = "https://files.pythonhosted.org/packages/5f/c6/258801143975a6d09a373f2641237992496e15567b907a4d401839d671b8/llvmlite-0.44.0-cp311-cp311-win_amd64.whl", hash = "sha256:d8489634d43c20cd0ad71330dde1d5bc7b9966937a263ff1ec1cebb90dc50955", size = 30331193 }, + { url = "https://files.pythonhosted.org/packages/15/86/e3c3195b92e6e492458f16d233e58a1a812aa2bfbef9bdd0fbafcec85c60/llvmlite-0.44.0-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:1d671a56acf725bf1b531d5ef76b86660a5ab8ef19bb6a46064a705c6ca80aad", size = 28132297 }, + { url = "https://files.pythonhosted.org/packages/d6/53/373b6b8be67b9221d12b24125fd0ec56b1078b660eeae266ec388a6ac9a0/llvmlite-0.44.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5f79a728e0435493611c9f405168682bb75ffd1fbe6fc360733b850c80a026db", size = 26201105 }, + { url = "https://files.pythonhosted.org/packages/cb/da/8341fd3056419441286c8e26bf436923021005ece0bff5f41906476ae514/llvmlite-0.44.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0143a5ef336da14deaa8ec26c5449ad5b6a2b564df82fcef4be040b9cacfea9", size = 42361901 }, + { url = "https://files.pythonhosted.org/packages/53/ad/d79349dc07b8a395a99153d7ce8b01d6fcdc9f8231355a5df55ded649b61/llvmlite-0.44.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d752f89e31b66db6f8da06df8b39f9b91e78c5feea1bf9e8c1fba1d1c24c065d", size = 41184247 }, + { url = "https://files.pythonhosted.org/packages/e2/3b/a9a17366af80127bd09decbe2a54d8974b6d8b274b39bf47fbaedeec6307/llvmlite-0.44.0-cp312-cp312-win_amd64.whl", hash = "sha256:eae7e2d4ca8f88f89d315b48c6b741dcb925d6a1042da694aa16ab3dd4cbd3a1", size = 30332380 }, + { url = "https://files.pythonhosted.org/packages/89/24/4c0ca705a717514c2092b18476e7a12c74d34d875e05e4d742618ebbf449/llvmlite-0.44.0-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:319bddd44e5f71ae2689859b7203080716448a3cd1128fb144fe5c055219d516", size = 28132306 }, + { url = "https://files.pythonhosted.org/packages/01/cf/1dd5a60ba6aee7122ab9243fd614abcf22f36b0437cbbe1ccf1e3391461c/llvmlite-0.44.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9c58867118bad04a0bb22a2e0068c693719658105e40009ffe95c7000fcde88e", size = 26201090 }, + { url = "https://files.pythonhosted.org/packages/d2/1b/656f5a357de7135a3777bd735cc7c9b8f23b4d37465505bd0eaf4be9befe/llvmlite-0.44.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46224058b13c96af1365290bdfebe9a6264ae62fb79b2b55693deed11657a8bf", size = 42361904 }, + { url = 
"https://files.pythonhosted.org/packages/d8/e1/12c5f20cb9168fb3464a34310411d5ad86e4163c8ff2d14a2b57e5cc6bac/llvmlite-0.44.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aa0097052c32bf721a4efc03bd109d335dfa57d9bffb3d4c24cc680711b8b4fc", size = 41184245 }, + { url = "https://files.pythonhosted.org/packages/d0/81/e66fc86539293282fd9cb7c9417438e897f369e79ffb62e1ae5e5154d4dd/llvmlite-0.44.0-cp313-cp313-win_amd64.whl", hash = "sha256:2fb7c4f2fb86cbae6dca3db9ab203eeea0e22d73b99bc2341cdf9de93612e930", size = 30331193 }, +] + [[package]] name = "loguru" version = "0.7.2" @@ -1047,6 +1122,38 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ab/d3/48c01d1944e0ee49fdc005bf518a68b0582d3bd201e5401664890b62a647/nest_asyncio-1.5.8-py3-none-any.whl", hash = "sha256:accda7a339a70599cb08f9dd09a67e0c2ef8d8d6f4c07f96ab203f2ae254e48d", size = 5268 }, ] +[[package]] +name = "numba" +version = "0.61.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "llvmlite" }, + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1c/a0/e21f57604304aa03ebb8e098429222722ad99176a4f979d34af1d1ee80da/numba-0.61.2.tar.gz", hash = "sha256:8750ee147940a6637b80ecf7f95062185ad8726c8c28a2295b8ec1160a196f7d", size = 2820615 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/eb/ca/f470be59552ccbf9531d2d383b67ae0b9b524d435fb4a0d229fef135116e/numba-0.61.2-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:cf9f9fc00d6eca0c23fc840817ce9f439b9f03c8f03d6246c0e7f0cb15b7162a", size = 2775663 }, + { url = "https://files.pythonhosted.org/packages/f5/13/3bdf52609c80d460a3b4acfb9fdb3817e392875c0d6270cf3fd9546f138b/numba-0.61.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ea0247617edcb5dd61f6106a56255baab031acc4257bddaeddb3a1003b4ca3fd", size = 2778344 }, + { url = "https://files.pythonhosted.org/packages/e2/7d/bfb2805bcfbd479f04f835241ecf28519f6e3609912e3a985aed45e21370/numba-0.61.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ae8c7a522c26215d5f62ebec436e3d341f7f590079245a2f1008dfd498cc1642", size = 3824054 }, + { url = "https://files.pythonhosted.org/packages/e3/27/797b2004745c92955470c73c82f0e300cf033c791f45bdecb4b33b12bdea/numba-0.61.2-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:bd1e74609855aa43661edffca37346e4e8462f6903889917e9f41db40907daa2", size = 3518531 }, + { url = "https://files.pythonhosted.org/packages/b1/c6/c2fb11e50482cb310afae87a997707f6c7d8a48967b9696271347441f650/numba-0.61.2-cp310-cp310-win_amd64.whl", hash = "sha256:ae45830b129c6137294093b269ef0a22998ccc27bf7cf096ab8dcf7bca8946f9", size = 2831612 }, + { url = "https://files.pythonhosted.org/packages/3f/97/c99d1056aed767503c228f7099dc11c402906b42a4757fec2819329abb98/numba-0.61.2-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:efd3db391df53aaa5cfbee189b6c910a5b471488749fd6606c3f33fc984c2ae2", size = 2775825 }, + { url = "https://files.pythonhosted.org/packages/95/9e/63c549f37136e892f006260c3e2613d09d5120672378191f2dc387ba65a2/numba-0.61.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:49c980e4171948ffebf6b9a2520ea81feed113c1f4890747ba7f59e74be84b1b", size = 2778695 }, + { url = "https://files.pythonhosted.org/packages/97/c8/8740616c8436c86c1b9a62e72cb891177d2c34c2d24ddcde4c390371bf4c/numba-0.61.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3945615cd73c2c7eba2a85ccc9c1730c21cd3958bfcf5a44302abae0fb07bb60", size = 3829227 }, + { url = 
"https://files.pythonhosted.org/packages/fc/06/66e99ae06507c31d15ff3ecd1f108f2f59e18b6e08662cd5f8a5853fbd18/numba-0.61.2-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbfdf4eca202cebade0b7d43896978e146f39398909a42941c9303f82f403a18", size = 3523422 }, + { url = "https://files.pythonhosted.org/packages/0f/a4/2b309a6a9f6d4d8cfba583401c7c2f9ff887adb5d54d8e2e130274c0973f/numba-0.61.2-cp311-cp311-win_amd64.whl", hash = "sha256:76bcec9f46259cedf888041b9886e257ae101c6268261b19fda8cfbc52bec9d1", size = 2831505 }, + { url = "https://files.pythonhosted.org/packages/b4/a0/c6b7b9c615cfa3b98c4c63f4316e3f6b3bbe2387740277006551784218cd/numba-0.61.2-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:34fba9406078bac7ab052efbf0d13939426c753ad72946baaa5bf9ae0ebb8dd2", size = 2776626 }, + { url = "https://files.pythonhosted.org/packages/92/4a/fe4e3c2ecad72d88f5f8cd04e7f7cff49e718398a2fac02d2947480a00ca/numba-0.61.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4ddce10009bc097b080fc96876d14c051cc0c7679e99de3e0af59014dab7dfe8", size = 2779287 }, + { url = "https://files.pythonhosted.org/packages/9a/2d/e518df036feab381c23a624dac47f8445ac55686ec7f11083655eb707da3/numba-0.61.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b1bb509d01f23d70325d3a5a0e237cbc9544dd50e50588bc581ba860c213546", size = 3885928 }, + { url = "https://files.pythonhosted.org/packages/10/0f/23cced68ead67b75d77cfcca3df4991d1855c897ee0ff3fe25a56ed82108/numba-0.61.2-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:48a53a3de8f8793526cbe330f2a39fe9a6638efcbf11bd63f3d2f9757ae345cd", size = 3577115 }, + { url = "https://files.pythonhosted.org/packages/68/1d/ddb3e704c5a8fb90142bf9dc195c27db02a08a99f037395503bfbc1d14b3/numba-0.61.2-cp312-cp312-win_amd64.whl", hash = "sha256:97cf4f12c728cf77c9c1d7c23707e4d8fb4632b46275f8f3397de33e5877af18", size = 2831929 }, + { url = "https://files.pythonhosted.org/packages/0b/f3/0fe4c1b1f2569e8a18ad90c159298d862f96c3964392a20d74fc628aee44/numba-0.61.2-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:3a10a8fc9afac40b1eac55717cece1b8b1ac0b946f5065c89e00bde646b5b154", size = 2771785 }, + { url = "https://files.pythonhosted.org/packages/e9/71/91b277d712e46bd5059f8a5866862ed1116091a7cb03bd2704ba8ebe015f/numba-0.61.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7d3bcada3c9afba3bed413fba45845f2fb9cd0d2b27dd58a1be90257e293d140", size = 2773289 }, + { url = "https://files.pythonhosted.org/packages/0d/e0/5ea04e7ad2c39288c0f0f9e8d47638ad70f28e275d092733b5817cf243c9/numba-0.61.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bdbca73ad81fa196bd53dc12e3aaf1564ae036e0c125f237c7644fe64a4928ab", size = 3893918 }, + { url = "https://files.pythonhosted.org/packages/17/58/064f4dcb7d7e9412f16ecf80ed753f92297e39f399c905389688cf950b81/numba-0.61.2-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:5f154aaea625fb32cfbe3b80c5456d514d416fcdf79733dd69c0df3a11348e9e", size = 3584056 }, + { url = "https://files.pythonhosted.org/packages/af/a4/6d3a0f2d3989e62a18749e1e9913d5fa4910bbb3e3311a035baea6caf26d/numba-0.61.2-cp313-cp313-win_amd64.whl", hash = "sha256:59321215e2e0ac5fa928a8020ab00b8e57cda8a97384963ac0dfa4d4e6aa54e7", size = 2831846 }, +] + [[package]] name = "numpy" version = "2.2.3" @@ -1128,6 +1235,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/37/e7/95437fb676381e927d4cb3f9f8dd90ed24cfd264f572db4d395037428594/openai-1.57.2-py3-none-any.whl", hash = "sha256:f7326283c156fdee875746e7e54d36959fb198eadc683952ee05e3302fbd638d", 
size = 389873 }, ] +[[package]] +name = "openpyxl" +version = "3.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "et-xmlfile" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910 }, +] + [[package]] name = "packaging" version = "24.2" @@ -1283,6 +1402,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b5/35/6c4c6fc8774a9e3629cd750dc24a7a4fb090a25ccd5c3246d127b70f9e22/propcache-0.3.0-py3-none-any.whl", hash = "sha256:67dda3c7325691c2081510e92c561f465ba61b975f481735aefdfc845d2cd043", size = 12101 }, ] +[[package]] +name = "psutil" +version = "5.9.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/90/c7/6dc0a455d111f68ee43f27793971cf03fe29b6ef972042549db29eec39a2/psutil-5.9.8.tar.gz", hash = "sha256:6be126e3225486dff286a8fb9a06246a5253f4c7c53b475ea5f5ac934e64194c", size = 503247 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/e3/07ae864a636d70a8a6f58da27cb1179192f1140d5d1da10886ade9405797/psutil-5.9.8-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:aee678c8720623dc456fa20659af736241f575d79429a0e5e9cf88ae0605cc81", size = 248702 }, + { url = "https://files.pythonhosted.org/packages/b3/bd/28c5f553667116b2598b9cc55908ec435cb7f77a34f2bff3e3ca765b0f78/psutil-5.9.8-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8cb6403ce6d8e047495a701dc7c5bd788add903f8986d523e3e20b98b733e421", size = 285242 }, + { url = "https://files.pythonhosted.org/packages/c5/4f/0e22aaa246f96d6ac87fe5ebb9c5a693fbe8877f537a1022527c47ca43c5/psutil-5.9.8-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d06016f7f8625a1825ba3732081d77c94589dca78b7a3fc072194851e88461a4", size = 288191 }, + { url = "https://files.pythonhosted.org/packages/6e/f5/2aa3a4acdc1e5940b59d421742356f133185667dd190b166dbcfcf5d7b43/psutil-5.9.8-cp37-abi3-win32.whl", hash = "sha256:bc56c2a1b0d15aa3eaa5a60c9f3f8e3e565303b465dbf57a1b730e7a2b9844e0", size = 251252 }, + { url = "https://files.pythonhosted.org/packages/93/52/3e39d26feae7df0aa0fd510b14012c3678b36ed068f7d78b8d8784d61f0e/psutil-5.9.8-cp37-abi3-win_amd64.whl", hash = "sha256:8db4c1b57507eef143a15a6884ca10f7c73876cdf5d51e713151c1236a0e68cf", size = 255090 }, + { url = "https://files.pythonhosted.org/packages/05/33/2d74d588408caedd065c2497bdb5ef83ce6082db01289a1e1147f6639802/psutil-5.9.8-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d16bbddf0693323b8c6123dd804100241da461e41d6e332fb0ba6058f630f8c8", size = 249898 }, +] + [[package]] name = "pyarrow" version = "19.0.1" @@ -1541,6 +1674,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/eb/38/ac33370d784287baa1c3d538978b5e2ea064d4c1b93ffbd12826c190dd10/pytz-2025.1-py2.py3-none-any.whl", hash = "sha256:89dd22dca55b46eac6eda23b2d72721bf1bdfef212645d81513ef5d03038de57", size = 507930 }, ] +[[package]] +name = "pyxlsb" +version = "1.0.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/3f/13/eebaeb7a40b062d1c6f7f91d09e73d30a69e33e4baa7cbe4b7658548b1cd/pyxlsb-1.0.10.tar.gz", hash = "sha256:8062d1ea8626d3f1980e8b1cfe91a4483747449242ecb61013bc2df85435f685", size = 22424 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/92/345823838ae367c59b63e03aef9c331f485370f9df6d049256a61a28f06d/pyxlsb-1.0.10-py2.py3-none-any.whl", hash = "sha256:87c122a9a622e35ca5e741d2e541201d28af00fb46bec492cfa9586890b120b4", size = 23849 }, +] + [[package]] name = "pyyaml" version = "6.0.2" @@ -1624,6 +1766,62 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/66/05/7957af15543b8c9799209506df4660cba7afc4cf94bfb60513827e96bed6/s3transfer-0.10.4-py3-none-any.whl", hash = "sha256:244a76a24355363a68164241438de1b72f8781664920260c48465896b712a41e", size = 83175 }, ] +[[package]] +name = "scipy" +version = "1.15.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b7/b9/31ba9cd990e626574baf93fbc1ac61cf9ed54faafd04c479117517661637/scipy-1.15.2.tar.gz", hash = "sha256:cd58a314d92838f7e6f755c8a2167ead4f27e1fd5c1251fd54289569ef3495ec", size = 59417316 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/df/ef233fff6838fe6f7840d69b5ef9f20d2b5c912a8727b21ebf876cb15d54/scipy-1.15.2-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:a2ec871edaa863e8213ea5df811cd600734f6400b4af272e1c011e69401218e9", size = 38692502 }, + { url = "https://files.pythonhosted.org/packages/5c/20/acdd4efb8a68b842968f7bc5611b1aeb819794508771ad104de418701422/scipy-1.15.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:6f223753c6ea76983af380787611ae1291e3ceb23917393079dcc746ba60cfb5", size = 30085508 }, + { url = "https://files.pythonhosted.org/packages/42/55/39cf96ca7126f1e78ee72a6344ebdc6702fc47d037319ad93221063e6cf4/scipy-1.15.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ecf797d2d798cf7c838c6d98321061eb3e72a74710e6c40540f0e8087e3b499e", size = 22359166 }, + { url = "https://files.pythonhosted.org/packages/51/48/708d26a4ab8a1441536bf2dfcad1df0ca14a69f010fba3ccbdfc02df7185/scipy-1.15.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:9b18aa747da280664642997e65aab1dd19d0c3d17068a04b3fe34e2559196cb9", size = 25112047 }, + { url = "https://files.pythonhosted.org/packages/dd/65/f9c5755b995ad892020381b8ae11f16d18616208e388621dfacc11df6de6/scipy-1.15.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87994da02e73549dfecaed9e09a4f9d58a045a053865679aeb8d6d43747d4df3", size = 35536214 }, + { url = "https://files.pythonhosted.org/packages/de/3c/c96d904b9892beec978562f64d8cc43f9cca0842e65bd3cd1b7f7389b0ba/scipy-1.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69ea6e56d00977f355c0f84eba69877b6df084516c602d93a33812aa04d90a3d", size = 37646981 }, + { url = "https://files.pythonhosted.org/packages/3d/74/c2d8a24d18acdeae69ed02e132b9bc1bb67b7bee90feee1afe05a68f9d67/scipy-1.15.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:888307125ea0c4466287191e5606a2c910963405ce9671448ff9c81c53f85f58", size = 37230048 }, + { url = "https://files.pythonhosted.org/packages/42/19/0aa4ce80eca82d487987eff0bc754f014dec10d20de2f66754fa4ea70204/scipy-1.15.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9412f5e408b397ff5641080ed1e798623dbe1ec0d78e72c9eca8992976fa65aa", size = 40010322 }, + { url = 
"https://files.pythonhosted.org/packages/d0/d2/f0683b7e992be44d1475cc144d1f1eeae63c73a14f862974b4db64af635e/scipy-1.15.2-cp310-cp310-win_amd64.whl", hash = "sha256:b5e025e903b4f166ea03b109bb241355b9c42c279ea694d8864d033727205e65", size = 41233385 }, + { url = "https://files.pythonhosted.org/packages/40/1f/bf0a5f338bda7c35c08b4ed0df797e7bafe8a78a97275e9f439aceb46193/scipy-1.15.2-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:92233b2df6938147be6fa8824b8136f29a18f016ecde986666be5f4d686a91a4", size = 38703651 }, + { url = "https://files.pythonhosted.org/packages/de/54/db126aad3874601048c2c20ae3d8a433dbfd7ba8381551e6f62606d9bd8e/scipy-1.15.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:62ca1ff3eb513e09ed17a5736929429189adf16d2d740f44e53270cc800ecff1", size = 30102038 }, + { url = "https://files.pythonhosted.org/packages/61/d8/84da3fffefb6c7d5a16968fe5b9f24c98606b165bb801bb0b8bc3985200f/scipy-1.15.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4c6676490ad76d1c2894d77f976144b41bd1a4052107902238047fb6a473e971", size = 22375518 }, + { url = "https://files.pythonhosted.org/packages/44/78/25535a6e63d3b9c4c90147371aedb5d04c72f3aee3a34451f2dc27c0c07f/scipy-1.15.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:a8bf5cb4a25046ac61d38f8d3c3426ec11ebc350246a4642f2f315fe95bda655", size = 25142523 }, + { url = "https://files.pythonhosted.org/packages/e0/22/4b4a26fe1cd9ed0bc2b2cb87b17d57e32ab72c346949eaf9288001f8aa8e/scipy-1.15.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a8e34cf4c188b6dd004654f88586d78f95639e48a25dfae9c5e34a6dc34547e", size = 35491547 }, + { url = "https://files.pythonhosted.org/packages/32/ea/564bacc26b676c06a00266a3f25fdfe91a9d9a2532ccea7ce6dd394541bc/scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28a0d2c2075946346e4408b211240764759e0fabaeb08d871639b5f3b1aca8a0", size = 37634077 }, + { url = "https://files.pythonhosted.org/packages/43/c2/bfd4e60668897a303b0ffb7191e965a5da4056f0d98acfb6ba529678f0fb/scipy-1.15.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:42dabaaa798e987c425ed76062794e93a243be8f0f20fff6e7a89f4d61cb3d40", size = 37231657 }, + { url = "https://files.pythonhosted.org/packages/4a/75/5f13050bf4f84c931bcab4f4e83c212a36876c3c2244475db34e4b5fe1a6/scipy-1.15.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6f5e296ec63c5da6ba6fa0343ea73fd51b8b3e1a300b0a8cae3ed4b1122c7462", size = 40035857 }, + { url = "https://files.pythonhosted.org/packages/b9/8b/7ec1832b09dbc88f3db411f8cdd47db04505c4b72c99b11c920a8f0479c3/scipy-1.15.2-cp311-cp311-win_amd64.whl", hash = "sha256:597a0c7008b21c035831c39927406c6181bcf8f60a73f36219b69d010aa04737", size = 41217654 }, + { url = "https://files.pythonhosted.org/packages/4b/5d/3c78815cbab499610f26b5bae6aed33e227225a9fa5290008a733a64f6fc/scipy-1.15.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c4697a10da8f8765bb7c83e24a470da5797e37041edfd77fd95ba3811a47c4fd", size = 38756184 }, + { url = "https://files.pythonhosted.org/packages/37/20/3d04eb066b471b6e171827548b9ddb3c21c6bbea72a4d84fc5989933910b/scipy-1.15.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:869269b767d5ee7ea6991ed7e22b3ca1f22de73ab9a49c44bad338b725603301", size = 30163558 }, + { url = "https://files.pythonhosted.org/packages/a4/98/e5c964526c929ef1f795d4c343b2ff98634ad2051bd2bbadfef9e772e413/scipy-1.15.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:bad78d580270a4d32470563ea86c6590b465cb98f83d760ff5b0990cb5518a93", size = 22437211 }, + { url = 
"https://files.pythonhosted.org/packages/1d/cd/1dc7371e29195ecbf5222f9afeedb210e0a75057d8afbd942aa6cf8c8eca/scipy-1.15.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:b09ae80010f52efddb15551025f9016c910296cf70adbf03ce2a8704f3a5ad20", size = 25232260 }, + { url = "https://files.pythonhosted.org/packages/f0/24/1a181a9e5050090e0b5138c5f496fee33293c342b788d02586bc410c6477/scipy-1.15.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a6fd6eac1ce74a9f77a7fc724080d507c5812d61e72bd5e4c489b042455865e", size = 35198095 }, + { url = "https://files.pythonhosted.org/packages/c0/53/eaada1a414c026673eb983f8b4a55fe5eb172725d33d62c1b21f63ff6ca4/scipy-1.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2b871df1fe1a3ba85d90e22742b93584f8d2b8e6124f8372ab15c71b73e428b8", size = 37297371 }, + { url = "https://files.pythonhosted.org/packages/e9/06/0449b744892ed22b7e7b9a1994a866e64895363572677a316a9042af1fe5/scipy-1.15.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:03205d57a28e18dfd39f0377d5002725bf1f19a46f444108c29bdb246b6c8a11", size = 36872390 }, + { url = "https://files.pythonhosted.org/packages/6a/6f/a8ac3cfd9505ec695c1bc35edc034d13afbd2fc1882a7c6b473e280397bb/scipy-1.15.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:601881dfb761311045b03114c5fe718a12634e5608c3b403737ae463c9885d53", size = 39700276 }, + { url = "https://files.pythonhosted.org/packages/f5/6f/e6e5aff77ea2a48dd96808bb51d7450875af154ee7cbe72188afb0b37929/scipy-1.15.2-cp312-cp312-win_amd64.whl", hash = "sha256:e7c68b6a43259ba0aab737237876e5c2c549a031ddb7abc28c7b47f22e202ded", size = 40942317 }, + { url = "https://files.pythonhosted.org/packages/53/40/09319f6e0f276ea2754196185f95cd191cb852288440ce035d5c3a931ea2/scipy-1.15.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:01edfac9f0798ad6b46d9c4c9ca0e0ad23dbf0b1eb70e96adb9fa7f525eff0bf", size = 38717587 }, + { url = "https://files.pythonhosted.org/packages/fe/c3/2854f40ecd19585d65afaef601e5e1f8dbf6758b2f95b5ea93d38655a2c6/scipy-1.15.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:08b57a9336b8e79b305a143c3655cc5bdbe6d5ece3378578888d2afbb51c4e37", size = 30100266 }, + { url = "https://files.pythonhosted.org/packages/dd/b1/f9fe6e3c828cb5930b5fe74cb479de5f3d66d682fa8adb77249acaf545b8/scipy-1.15.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:54c462098484e7466362a9f1672d20888f724911a74c22ae35b61f9c5919183d", size = 22373768 }, + { url = "https://files.pythonhosted.org/packages/15/9d/a60db8c795700414c3f681908a2b911e031e024d93214f2d23c6dae174ab/scipy-1.15.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:cf72ff559a53a6a6d77bd8eefd12a17995ffa44ad86c77a5df96f533d4e6c6bb", size = 25154719 }, + { url = "https://files.pythonhosted.org/packages/37/3b/9bda92a85cd93f19f9ed90ade84aa1e51657e29988317fabdd44544f1dd4/scipy-1.15.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9de9d1416b3d9e7df9923ab23cd2fe714244af10b763975bea9e4f2e81cebd27", size = 35163195 }, + { url = "https://files.pythonhosted.org/packages/03/5a/fc34bf1aa14dc7c0e701691fa8685f3faec80e57d816615e3625f28feb43/scipy-1.15.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb530e4794fc8ea76a4a21ccb67dea33e5e0e60f07fc38a49e821e1eae3b71a0", size = 37255404 }, + { url = "https://files.pythonhosted.org/packages/4a/71/472eac45440cee134c8a180dbe4c01b3ec247e0338b7c759e6cd71f199a7/scipy-1.15.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = 
"sha256:5ea7ed46d437fc52350b028b1d44e002646e28f3e8ddc714011aaf87330f2f32", size = 36860011 }, + { url = "https://files.pythonhosted.org/packages/01/b3/21f890f4f42daf20e4d3aaa18182dddb9192771cd47445aaae2e318f6738/scipy-1.15.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:11e7ad32cf184b74380f43d3c0a706f49358b904fa7d5345f16ddf993609184d", size = 39657406 }, + { url = "https://files.pythonhosted.org/packages/0d/76/77cf2ac1f2a9cc00c073d49e1e16244e389dd88e2490c91d84e1e3e4d126/scipy-1.15.2-cp313-cp313-win_amd64.whl", hash = "sha256:a5080a79dfb9b78b768cebf3c9dcbc7b665c5875793569f48bf0e2b1d7f68f6f", size = 40961243 }, + { url = "https://files.pythonhosted.org/packages/4c/4b/a57f8ddcf48e129e6054fa9899a2a86d1fc6b07a0e15c7eebff7ca94533f/scipy-1.15.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:447ce30cee6a9d5d1379087c9e474628dab3db4a67484be1b7dc3196bfb2fac9", size = 38870286 }, + { url = "https://files.pythonhosted.org/packages/0c/43/c304d69a56c91ad5f188c0714f6a97b9c1fed93128c691148621274a3a68/scipy-1.15.2-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:c90ebe8aaa4397eaefa8455a8182b164a6cc1d59ad53f79943f266d99f68687f", size = 30141634 }, + { url = "https://files.pythonhosted.org/packages/44/1a/6c21b45d2548eb73be9b9bff421aaaa7e85e22c1f9b3bc44b23485dfce0a/scipy-1.15.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:def751dd08243934c884a3221156d63e15234a3155cf25978b0a668409d45eb6", size = 22415179 }, + { url = "https://files.pythonhosted.org/packages/74/4b/aefac4bba80ef815b64f55da06f62f92be5d03b467f2ce3668071799429a/scipy-1.15.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:302093e7dfb120e55515936cb55618ee0b895f8bcaf18ff81eca086c17bd80af", size = 25126412 }, + { url = "https://files.pythonhosted.org/packages/b1/53/1cbb148e6e8f1660aacd9f0a9dfa2b05e9ff1cb54b4386fe868477972ac2/scipy-1.15.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cd5b77413e1855351cdde594eca99c1f4a588c2d63711388b6a1f1c01f62274", size = 34952867 }, + { url = "https://files.pythonhosted.org/packages/2c/23/e0eb7f31a9c13cf2dca083828b97992dd22f8184c6ce4fec5deec0c81fcf/scipy-1.15.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d0194c37037707b2afa7a2f2a924cf7bac3dc292d51b6a925e5fcb89bc5c776", size = 36890009 }, + { url = "https://files.pythonhosted.org/packages/03/f3/e699e19cabe96bbac5189c04aaa970718f0105cff03d458dc5e2b6bd1e8c/scipy-1.15.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:bae43364d600fdc3ac327db99659dcb79e6e7ecd279a75fe1266669d9a652828", size = 36545159 }, + { url = "https://files.pythonhosted.org/packages/af/f5/ab3838e56fe5cc22383d6fcf2336e48c8fe33e944b9037fbf6cbdf5a11f8/scipy-1.15.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f031846580d9acccd0044efd1a90e6f4df3a6e12b4b6bd694a7bc03a89892b28", size = 39136566 }, + { url = "https://files.pythonhosted.org/packages/0a/c8/b3f566db71461cabd4b2d5b39bcc24a7e1c119535c8361f81426be39bb47/scipy-1.15.2-cp313-cp313t-win_amd64.whl", hash = "sha256:fe8a9eb875d430d81755472c5ba75e84acc980e4a8f6204d402849234d3017db", size = 40477705 }, +] + [[package]] name = "six" version = "1.17.0" From f883e3a4c813c5bcad47a321038522b3eb6559de Mon Sep 17 00:00:00 2001 From: Khauneesh-AI Date: Tue, 6 May 2025 21:01:18 +0530 Subject: [PATCH 09/10] added analyser files required for previous commit --- app/core/data_analyser.py | 357 ++++++++++++++++++++++++++++++++++ app/core/data_loader.py | 160 +++++++++++++++ app/core/summary_formatter.py | 51 +++++ 3 files changed, 568 insertions(+) create mode 
100644 app/core/data_analyser.py create mode 100644 app/core/data_loader.py create mode 100644 app/core/summary_formatter.py diff --git a/app/core/data_analyser.py b/app/core/data_analyser.py new file mode 100644 index 0000000..977de10 --- /dev/null +++ b/app/core/data_analyser.py @@ -0,0 +1,357 @@ +import pandas as pd +import numpy as np +import warnings +from typing import Dict, List, Any, Union, Optional, Tuple +import math + +class DataAnalyser: + """Utility class for analyzing datasets and providing statistical insights.""" + + @classmethod + def analyse(cls, df: pd.DataFrame, correlation_threshold: float = 0.7) -> Dict[str, Any]: + """ + Analyze a DataFrame and extract useful statistics and insights. + + Args: + df: Input DataFrame to analyze + correlation_threshold: Threshold for identifying strong correlations + + Returns: + Dictionary containing analysis results + """ + print("Analyzing data...") + + # Initialize results structure + results = {"columns": [], + "grp_columns": {}, + "statistical_analysis": {}, + "cross_row_relationship": {}, + "cross_column_relationship": {} + } + + # Categorize columns + results["grp_columns"] = cls.categorize_columns(df) + results["columns"]= df.columns.tolist() + + # Analyze each type of column + stats = {} + if results["grp_columns"]["numeric"]: + stats["numeric"] = cls.analyze_numeric_columns(df, results["grp_columns"]["numeric"]) + + if results["grp_columns"]["categorical"]: + stats["categorical"] = cls.analyze_categorical_columns(df, results["grp_columns"]["categorical"]) + + if results["grp_columns"]["datetime"]: + stats["datetime"] = cls.analyze_datetime_columns(df, results["grp_columns"]["datetime"]) + + results["statistical_analysis"] = stats + + # Analyze cross-row relationships + results["cross_row_relationship"] = cls.analyze_cross_row_relationships(df) + + # Analyze cross-column relationships + if results["grp_columns"]["numeric"] and len(results["grp_columns"]["numeric"]) > 1: + results["cross_column_relationship"] = cls.analyze_cross_column_relationships( + df, results["grp_columns"]["numeric"], correlation_threshold + ) + + return results + + @classmethod + def categorize_columns(cls, df: pd.DataFrame) -> Dict[str, List[str]]: + """ + Categorize DataFrame columns by their data types. 
+
+ Args:
+ df: Input DataFrame
+
+ Returns:
+ Dictionary mapping column types to lists of column names
+ """
+ result = {
+ "numeric": [],
+ "categorical": [],
+ "datetime": [],
+ "text": [],
+ "other": []
+ }
+
+ for col in df.columns:
+ column = df[col]
+
+ # Check if already datetime type - most reliable method
+ if pd.api.types.is_datetime64_any_dtype(column):
+ result["datetime"].append(col)
+
+ # Check numeric types
+ elif pd.api.types.is_numeric_dtype(column) and not pd.api.types.is_bool_dtype(column):
+ result["numeric"].append(col)
+
+ # Check categorical and boolean
+ elif pd.api.types.is_categorical_dtype(column) or pd.api.types.is_bool_dtype(column):
+ result["categorical"].append(col)
+
+ # Check for text columns
+ elif pd.api.types.is_string_dtype(column) or pd.api.types.is_object_dtype(column):
+ # Treat object columns with few unique values as categorical
+ non_null_count = column.count()
+ if non_null_count > 0:
+ unique_ratio = column.nunique() / non_null_count
+ if unique_ratio < 0.2: # If less than 20% of values are unique, consider categorical
+ result["categorical"].append(col)
+ else:
+ result["text"].append(col)
+ else:
+ result["text"].append(col)
+
+ # Everything else
+ else:
+ result["other"].append(col)
+
+ # Verify all columns are categorized
+ categorized = []
+ for category, cols in result.items():
+ categorized.extend(cols)
+
+ missing = set(df.columns) - set(categorized)
+ if missing:
+ print(f"Found uncategorized columns: {missing}")
+ result["other"].extend(list(missing))
+
+ return result
+
+ @classmethod
+ def analyze_numeric_columns(cls, df: pd.DataFrame, numeric_columns: List[str]) -> Dict[str, Dict[str, Any]]:
+ """
+ Analyze numeric columns to extract statistical information.
+
+ Args:
+ df: Input DataFrame
+ numeric_columns: List of numeric column names
+
+ Returns:
+ Dictionary mapping column names to their statistics
+ """
+ result = {}
+
+ for col in numeric_columns:
+ # Skip columns with all NaN values
+ if df[col].isna().all():
+ continue
+
+ stats = {}
+
+ # Basic statistics
+ stats["count"] = int(df[col].count())
+ stats["mean"] = float(df[col].mean())
+ stats["median"] = float(df[col].median())
+ stats["std"] = float(df[col].std())
+ stats["min"] = float(df[col].min())
+ stats["max"] = float(df[col].max())
+
+ # Calculate percentiles
+ for p in [25, 75, 90, 95, 99]:
+ stats[f"p{p}"] = float(df[col].quantile(p/100))
+
+ # Null value statistics
+ null_count = int(df[col].isna().sum())
+ stats["null_count"] = null_count
+ stats["null_percentage"] = float((null_count / len(df)) * 100)
+
+ result[col] = stats
+
+ return result
+
+ @classmethod
+ def analyze_categorical_columns(cls, df: pd.DataFrame, categorical_columns: List[str]) -> Dict[str, Dict[str, Any]]:
+ """
+ Analyze categorical columns to extract distribution information.
+ + Args: + df: Input DataFrame + categorical_columns: List of categorical column names + + Returns: + Dictionary mapping column names to their statistics + """ + result = {} + + for col in categorical_columns: + # Skip columns with all NaN values + if df[col].isna().all(): + continue + + stats = {} + + # Basic statistics + stats["count"] = int(df[col].count()) + stats["unique_count"] = int(df[col].nunique()) + + # Value distribution (top 10 most common values) + value_counts = df[col].value_counts().head(10).to_dict() + # Convert any non-string keys to strings for JSON compatibility + top_values = {} + for k, v in value_counts.items(): + key = str(k) if not isinstance(k, str) else k + top_values[key] = int(v) + + stats["top_values"] = top_values + + # Calculate entropy to measure randomness + counts = df[col].value_counts() + probs = counts / counts.sum() + entropy = -np.sum(probs * np.log2(probs)) + stats["entropy"] = float(entropy) + + # Null value statistics + null_count = int(df[col].isna().sum()) + stats["null_count"] = null_count + stats["null_percentage"] = float((null_count / len(df)) * 100) + + result[col] = stats + + return result + + @classmethod + def analyze_datetime_columns(cls, df: pd.DataFrame, datetime_columns: List[str]) -> Dict[str, Dict[str, Any]]: + """ + Analyze datetime columns to extract temporal patterns. + + Args: + df: Input DataFrame + datetime_columns: List of datetime column names + + Returns: + Dictionary mapping column names to their statistics + """ + result = {} + + for col in datetime_columns: + # Skip columns with all NaN values + if df[col].isna().all(): + continue + + stats = {} + + # Basic statistics + stats["count"] = int(df[col].count()) + stats["min"] = str(df[col].min()) + stats["max"] = str(df[col].max()) + + # Calculate temporal span + min_date = df[col].min() + max_date = df[col].max() + if pd.notna(min_date) and pd.notna(max_date): + span_days = (max_date - min_date).total_seconds() / (60 * 60 * 24) + stats["span_days"] = float(span_days) + + # Extract date parts distribution + date_parts = {} + + # Year distribution + if df[col].dt.year.nunique() > 1: + year_counts = df[col].dt.year.value_counts().to_dict() + date_parts["year"] = {str(k): int(v) for k, v in year_counts.items()} + + # Month distribution + month_counts = df[col].dt.month.value_counts().to_dict() + date_parts["month"] = {str(k): int(v) for k, v in month_counts.items()} + + # Day of week distribution + dow_counts = df[col].dt.dayofweek.value_counts().to_dict() + date_parts["day_of_week"] = {str(k): int(v) for k, v in dow_counts.items()} + + # Hour distribution (if time component exists) + if (df[col].dt.hour != 0).any(): + hour_counts = df[col].dt.hour.value_counts().to_dict() + date_parts["hour"] = {str(k): int(v) for k, v in hour_counts.items()} + + stats["date_parts"] = date_parts + + # Null value statistics + null_count = int(df[col].isna().sum()) + stats["null_count"] = null_count + stats["null_percentage"] = float((null_count / len(df)) * 100) + + result[col] = stats + + return result + + @classmethod + def analyze_cross_row_relationships(cls, df: pd.DataFrame) -> Dict[str, Any]: + """ + Analyze relationships across rows, such as duplicates and null patterns. 
+ + Args: + df: Input DataFrame + + Returns: + Dictionary containing cross-row relationship information + """ + result = {} + + # Analyze duplicates + duplicates = df.duplicated() + duplicate_count = int(duplicates.sum()) + duplicate_percentage = float((duplicate_count / len(df)) * 100) + + result["duplicates"] = { + "count": duplicate_count, + "percentage": duplicate_percentage + } + + # Analyze rows with null values + rows_with_null = df.isna().any(axis=1) + null_rows_count = int(rows_with_null.sum()) + null_rows_percentage = float((null_rows_count / len(df)) * 100) + + result["null_rows"] = { + "count": null_rows_count, + "percentage": null_rows_percentage + } + + return result + + @classmethod + def analyze_cross_column_relationships( + cls, df: pd.DataFrame, numeric_columns: List[str], correlation_threshold: float + ) -> Dict[str, Any]: + """ + Analyze relationships between columns, such as correlations. + + Args: + df: Input DataFrame + numeric_columns: List of numeric column names + correlation_threshold: Threshold for identifying strong correlations + + Returns: + Dictionary containing cross-column relationship information + """ + result = {} + + # Calculate correlations between numeric columns + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + corr_matrix = df[numeric_columns].corr() + + # Extract strong correlations (ignore self-correlations) + strong_correlations = {} + for i in range(len(numeric_columns)): + for j in range(i+1, len(numeric_columns)): + col1 = numeric_columns[i] + col2 = numeric_columns[j] + corr_value = corr_matrix.iloc[i, j] + + # Skip NaN correlations + if pd.isna(corr_value): + continue + + # Store absolute correlation values above threshold + if abs(corr_value) >= correlation_threshold: + pair_name = f"{col1} - {col2}" + strong_correlations[pair_name] = float(corr_value) + + if strong_correlations: + result["correlations"] = strong_correlations + + return result \ No newline at end of file diff --git a/app/core/data_loader.py b/app/core/data_loader.py new file mode 100644 index 0000000..6a7fbc4 --- /dev/null +++ b/app/core/data_loader.py @@ -0,0 +1,160 @@ +import pandas as pd +import numpy as np +import json +import os +import warnings +from pathlib import Path +from typing import Optional, Union + +class DataLoader: + """Load arbitrary tabular data into a DataFrame with robust error handling.""" + + @staticmethod + def load(path: str, sample_rows: int = 100000) -> pd.DataFrame: + """ + Load data from various file formats into a pandas DataFrame. 
+
+ Args:
+ path: Path to the data file
+ sample_rows: Maximum number of rows to load for large files
+
+ Returns:
+ pandas DataFrame with the loaded data
+ """
+ # Validate the path exists
+ if not os.path.exists(path):
+ raise FileNotFoundError(f"File not found: {path}")
+
+ # Get file extension
+ ext = Path(path).suffix.lower()
+
+ try:
+ if ext == ".csv":
+ # Try different encoding and delimiter options
+ try:
+ df = pd.read_csv(path, encoding='utf-8')
+ except Exception:
+ try:
+ df = pd.read_csv(path, encoding='latin1')
+ except Exception:
+ try:
+ # Try with different delimiters
+ df = pd.read_csv(path, sep=None, engine='python')
+ except Exception:
+ # Last resort - try reading with very permissive settings
+ df = pd.read_csv(path, sep=None, engine='python',
+ encoding='latin1', on_bad_lines='skip')
+ elif ext == ".tsv":
+ df = pd.read_csv(path, sep='\t')
+ elif ext == ".json":
+ # Try multiple JSON formats
+ try:
+ # Try JSONL format first
+ df = pd.read_json(path, lines=True)
+ except ValueError:
+ try:
+ # Then try normal JSON
+ df = pd.read_json(path)
+ except Exception:
+ # Try loading as raw JSON and converting
+ with open(path, 'r', encoding='utf-8') as f:
+ data = json.load(f)
+ if isinstance(data, list):
+ df = pd.DataFrame(data)
+ elif isinstance(data, dict):
+ # If it's a dict, try to extract a list or convert the dict itself
+ for k, v in data.items():
+ if isinstance(v, list) and len(v) > 0:
+ df = pd.DataFrame(v)
+ break
+ else:
+ df = pd.DataFrame([data])
+ else:
+ raise ValueError(f"Unsupported JSON structure in {path}")
+ elif ext in (".xls", ".xlsx"):
+ try:
+ # Try with openpyxl first
+ df = pd.read_excel(path, engine="openpyxl")
+ except Exception:
+ # Fall back to xlrd for older Excel files
+ df = pd.read_excel(path)
+ elif ext == ".xlsb":
+ df = pd.read_excel(path, engine="pyxlsb")
+ elif ext == ".parquet":
+ df = pd.read_parquet(path)
+ elif ext == ".feather":
+ df = pd.read_feather(path)
+ elif ext == ".pickle" or ext == ".pkl":
+ df = pd.read_pickle(path)
+ elif ext == ".sas7bdat":
+ df = pd.read_sas(path)
+ elif ext == ".dta":
+ df = pd.read_stata(path)
+ elif ext == ".h5" or ext == ".hdf5":
+ df = pd.read_hdf(path)
+ else:
+ raise ValueError(f"Unsupported file extension: {ext}")
+
+ # Clean up the DataFrame
+ # Replace infinite values with NaN
+ df = df.replace([np.inf, -np.inf], np.nan)
+
+ # Handle duplicate column names: suffix only the repeated occurrences
+ if df.columns.duplicated().any():
+ dup_mask = df.columns.duplicated()
+ df.columns = [f"{col}_{i}" if dup else col
+ for i, (col, dup) in enumerate(zip(df.columns, dup_mask))]
+
+ # Keep memory/latency bounded
+ if len(df) > sample_rows:
+ df = df.sample(sample_rows, random_state=42)
+
+ # Process column types
+ df = DataLoader.infer_dtypes(df)
+
+ return df.reset_index(drop=True)
+
+ except Exception as e:
+ print(f"Error loading data from {path}: {str(e)}")
+ # Return an empty DataFrame with a message column
+ return pd.DataFrame({"error_message": [f"Failed to load data: {str(e)}"]})
+
+ @staticmethod
+ def parse_datetime(series):
+ """
+ Parse datetime with appropriate format while suppressing warnings.
+ """ + # Skip if already datetime + if pd.api.types.is_datetime64_any_dtype(series): + return series + + # Suppress warnings and use dateutil parser + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + return pd.to_datetime(series, errors='coerce') + + @staticmethod + def infer_dtypes(df: pd.DataFrame) -> pd.DataFrame: + """Attempt to infer correct data types for all columns.""" + for col in df.columns: + # Skip columns that are already numeric or datetime + if pd.api.types.is_numeric_dtype(df[col]) or pd.api.types.is_datetime64_any_dtype(df[col]): + continue + + # Try to convert to numeric + numeric_series = pd.to_numeric(df[col], errors='coerce') + if numeric_series.notna().sum() > 0.8 * df[col].count(): # Over 80% valid numbers + df[col] = numeric_series + continue + + # Try to convert to datetime - with warnings suppressed + try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + datetime_series = pd.to_datetime(df[col], errors='coerce') + if datetime_series.notna().sum() > 0.8 * df[col].count(): # Over 80% valid dates + df[col] = datetime_series + continue + except: + pass + + return df \ No newline at end of file diff --git a/app/core/summary_formatter.py b/app/core/summary_formatter.py new file mode 100644 index 0000000..aea0406 --- /dev/null +++ b/app/core/summary_formatter.py @@ -0,0 +1,51 @@ +import json +import pandas as pd +import numpy as np +from typing import Dict, Any, Optional + +class SummaryFormatter: + """Build XML‑ish blocks for prompt ingestion with error handling.""" + + @staticmethod + def first_rows_block(df: pd.DataFrame, n: int = 10) -> str: + """Generate a CSV snippet of the first n rows.""" + try: + # Handle potential issues with object serialization + # Replace problematic values with their string representation + safe_df = df.head(n).copy() + + for col in safe_df.columns: + # Replace problematic values in object columns + if safe_df[col].dtype == 'object': + safe_df[col] = safe_df[col].apply(lambda x: + str(x) if x is not None else None) + + # Use CSV repr so models instantly see delimiters + return safe_df.to_csv(index=False) + except Exception as e: + return f"Error generating CSV snippet: {str(e)}\n" + + @staticmethod + def json_block(summary: Dict[str, Any]) -> str: + """Convert summary dict to a JSON string, handling problematic values.""" + try: + # Handle non-serializable objects + def clean_for_json(obj): + if isinstance(obj, dict): + return {k: clean_for_json(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [clean_for_json(item) for item in obj] + elif isinstance(obj, (int, float, str, bool, type(None))): + return obj + elif isinstance(obj, np.number): + return float(obj) + else: + return str(obj) + + # Clean the summary dict + clean_summary = clean_for_json(summary) + + # Convert to JSON + return json.dumps(clean_summary, separators=(",", ":"), ensure_ascii=False) + except Exception as e: + return f"Error generating JSON summary: {str(e)}" \ No newline at end of file From d0fb070d3957f0aa3fe1f6fc9da3912e573d79b1 Mon Sep 17 00:00:00 2001 From: Keivan Vosoughi Date: Thu, 17 Apr 2025 14:00:33 -0700 Subject: [PATCH 10/10] Initial Checkin for free form data generation Add Free Form Workflow Type Add Free From Prompt Add use case for free form workflow type Usecase fix for freeform Show Seed Instructions for FreeForm Ftech File Content Add AG Grid Add Modules Add Themes Adding FreeFormTable Fix Examples Table for Freeform Dataset Fix Dataset Details Page for Freeforms Add Dataset Viewer Hide Exmaples 
Buttons for Freeforms Fix for Examples in Dataset Details Page (freeform) Update Examples Message Fix for very long seeds Fix for Dataset Details Page Fix for Freeform Table in Results Page Adding Re-generate Dataset Changes Add Evaluation for Freeforms Fix for custom prompt modal Fix Lint Errors --- app/client/eslint.config.js | 1 + app/client/package.json | 2 + app/client/src/Container.tsx | 3 - app/client/src/api/api.ts | 11 +- app/client/src/api/hooks.ts | 4 +- .../src/components/RouteAccessControl.tsx | 2 +- .../src/components/TelemetryDashboard.tsx | 6 +- .../src/pages/DataGenerator/Configure.tsx | 45 ++++- .../DataGenerator/CustomPromptButton.tsx | 40 ++++- .../src/pages/DataGenerator/DataGenerator.tsx | 17 +- .../src/pages/DataGenerator/Examples.tsx | 166 ++++++++++++++++-- .../DataGenerator/FileSelectorButton.tsx | 8 +- .../src/pages/DataGenerator/FilesTable.tsx | 11 +- app/client/src/pages/DataGenerator/Finish.tsx | 36 +++- .../DataGenerator/FreeFormExampleTable.tsx | 127 ++++++++++++++ .../src/pages/DataGenerator/FreeFormTable.tsx | 139 +++++++++++++++ .../pages/DataGenerator/PCModalContent.tsx | 2 +- .../src/pages/DataGenerator/Parameters.tsx | 8 +- app/client/src/pages/DataGenerator/Prompt.tsx | 9 +- .../pages/DataGenerator/SeedResultTable.tsx | 2 +- .../src/pages/DataGenerator/Summary.tsx | 12 +- .../src/pages/DataGenerator/constants.ts | 3 +- app/client/src/pages/DataGenerator/hooks.ts | 38 ++-- app/client/src/pages/DataGenerator/types.ts | 8 +- app/client/src/pages/DataGenerator/utils.ts | 32 ++++ .../pages/DatasetDetails/ConfigurationTab.tsx | 5 +- .../DatasetDetails/CustomGenerationTable.tsx | 4 +- .../DatasetDetails/DatasetDetailsPage.tsx | 17 +- .../DatasetDetails/DatasetGenerationTab.tsx | 12 +- .../DatasetGenerationTopics.tsx | 36 +--- .../pages/DatasetDetails/DatasetViewer.tsx | 44 +++++ .../src/pages/DatasetDetails/ExampleModal.tsx | 2 +- .../pages/DatasetDetails/ExamplesSection.tsx | 51 +----- .../DatasetDetails/TopicGenerationTable.tsx | 4 +- .../src/pages/DatasetDetails/constants.tsx | 2 - .../EvaluationConfigurationTab.tsx | 2 - .../EvaluationGenerationTab.tsx | 20 ++- .../src/pages/EvaluationDetails/hooks.ts | 6 +- .../pages/Evaluator/EvaluateExampleTable.tsx | 6 +- .../src/pages/Evaluator/EvaluatorPage.tsx | 24 +-- .../src/pages/Evaluator/EvaluatorSuccess.tsx | 23 ++- .../Evaluator/FreeFromEvaluationTable.tsx | 140 +++++++++++++++ .../Evaluator/GeneratedEvaluationModal.tsx | 4 +- .../src/pages/Evaluator/ReevaluatorPage.tsx | 8 +- .../src/pages/Evaluator/SeedEvaluateTable.tsx | 6 +- app/client/src/pages/Home/DatasetsTab.tsx | 4 +- app/client/src/pages/Home/EvaluateButton.tsx | 3 +- app/client/src/pages/Home/EvaluationsTab.tsx | 2 +- app/client/src/pages/Home/hooks.ts | 4 +- app/client/src/utils/sortutils.ts | 2 +- build/shell_scripts/build_client.sh | 1 + 51 files changed, 909 insertions(+), 255 deletions(-) create mode 100644 app/client/src/pages/DataGenerator/FreeFormExampleTable.tsx create mode 100644 app/client/src/pages/DataGenerator/FreeFormTable.tsx create mode 100644 app/client/src/pages/DatasetDetails/DatasetViewer.tsx create mode 100644 app/client/src/pages/Evaluator/FreeFromEvaluationTable.tsx diff --git a/app/client/eslint.config.js b/app/client/eslint.config.js index 092408a..6a991c7 100644 --- a/app/client/eslint.config.js +++ b/app/client/eslint.config.js @@ -23,6 +23,7 @@ export default tseslint.config( 'warn', { allowConstantExport: true }, ], + '@typescript-eslint/no-explicit-any': ['warn', { 'fixToUnknown': true, 'ignoreRestArgs': false }] }, 
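The new @typescript-eslint/no-explicit-any rule warns on every explicit any and auto-fixes it to unknown, which is why the hunks that follow swap any for unknown and narrow at the point of use. A minimal sketch of the narrowing pattern this enforces (names illustrative, not from the patch):

    // `unknown` cannot be dereferenced until it is narrowed, so callers must
    // validate shape at runtime instead of silently trusting `any`.
    function readGenerateFileName(state: unknown): string | undefined {
      if (typeof state === 'object' && state !== null && 'generate_file_name' in state) {
        const value = (state as { generate_file_name: unknown }).generate_file_name;
        return typeof value === 'string' ? value : undefined;
      }
      return undefined;
    }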
}, ) diff --git a/app/client/package.json b/app/client/package.json index f3e0095..bf69255 100644 --- a/app/client/package.json +++ b/app/client/package.json @@ -16,6 +16,8 @@ "@mui/icons-material": "6.1.7", "@mui/material": "6.1.7", "@tanstack/react-query": "5.66.0", + "ag-grid-community": "33.2.4", + "ag-grid-react":"33.2.4", "antd": "5.22.1", "axios": "1.6.7", "lodash": "4.17.21", diff --git a/app/client/src/Container.tsx b/app/client/src/Container.tsx index 94db670..f881d2c 100644 --- a/app/client/src/Container.tsx +++ b/app/client/src/Container.tsx @@ -48,9 +48,6 @@ const PageHeader = styled(Header)` height: fit-content; padding: 5px 15px `; -const StyledImg = styled.img` - height: ${props => props?.height && `${props.height}px`} -` const StyledText = styled.div` font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, 'Noto Sans', sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol', 'Noto Color Emoji'; diff --git a/app/client/src/api/api.ts b/app/client/src/api/api.ts index 19d9073..b343957 100644 --- a/app/client/src/api/api.ts +++ b/app/client/src/api/api.ts @@ -27,8 +27,11 @@ export const useFetchModels = (): UseFetchApiReturn => { return useFetch(url); } -export const useFetchDefaultPrompt = (useCase: string): UseFetchApiReturn => { - const url = `${baseUrl}/${isEmpty(useCase) ? 'custom' : useCase}/gen_prompt`; +export const useFetchDefaultPrompt = (useCase: string, workflowType?: WorkerType): UseFetchApiReturn => { + let url = `${baseUrl}/${isEmpty(useCase) ? 'custom' : useCase}/gen_prompt`; + if (workflowType && workflowType === 'freeform') { + url = `${baseUrl}/${isEmpty(useCase) ? 'custom' : useCase}/gen_freeform_prompt`; + } return useFetch(url); } @@ -42,7 +45,7 @@ export const useFetchDefaultModelParams = (): UseFetchApiReturn() => { - const genDatasetUrl = `${import.meta.env.VITE_AMP_URL}/synthesis/generate`; +export const useTriggerDatagen = (workflow_type: string) => { + const genDatasetUrl = `${import.meta.env.VITE_AMP_URL}/synthesis/${workflow_type === 'freeform' ? 
'freeform' : 'generate'}`; return usePostApi(genDatasetUrl); } diff --git a/app/client/src/api/hooks.ts b/app/client/src/api/hooks.ts index e233f18..81dc16a 100644 --- a/app/client/src/api/hooks.ts +++ b/app/client/src/api/hooks.ts @@ -106,7 +106,7 @@ interface UsePostApiReturn { data: T | null; loading: boolean; error: Error | null; - triggerPost: (body: Record) => Promise; + triggerPost: (body: Record) => Promise; } export function usePostApi(url: string): UsePostApiReturn { @@ -114,7 +114,7 @@ export function usePostApi(url: string): UsePostApiReturn { const [loading, setLoading] = useState(false); const [error, setError] = useState(null); - const triggerPost = async (body: Record) => { + const triggerPost = async (body: Record) => { setLoading(true); setError(null); // Reset error on each request diff --git a/app/client/src/components/RouteAccessControl.tsx b/app/client/src/components/RouteAccessControl.tsx index dbd46b3..dc8242d 100644 --- a/app/client/src/components/RouteAccessControl.tsx +++ b/app/client/src/components/RouteAccessControl.tsx @@ -10,7 +10,7 @@ import { Navigate, useLocation } from "react-router-dom"; */ interface RouteACProps{ element: ReactNode; - validator: (state: any | null) => boolean; + validator: (state: unknown | null) => boolean; reroutePath?: string; } const RouteAccessControl: FC = ({ element, validator, reroutePath = '/' }) => { diff --git a/app/client/src/components/TelemetryDashboard.tsx b/app/client/src/components/TelemetryDashboard.tsx index b154186..27d7269 100644 --- a/app/client/src/components/TelemetryDashboard.tsx +++ b/app/client/src/components/TelemetryDashboard.tsx @@ -1,7 +1,7 @@ import React, { useState, useEffect } from 'react'; import { BarChart, Bar, XAxis, YAxis, CartesianGrid, Tooltip, Legend, ResponsiveContainer, - LineChart, Line, AreaChart, Area + LineChart, Line } from 'recharts'; import axios from 'axios'; import { @@ -9,7 +9,7 @@ import { } from 'antd'; import { DashboardOutlined, ApiOutlined, CloudServerOutlined, RocketOutlined, SyncOutlined, - PieChartOutlined, BarChartOutlined, CodeOutlined, WarningOutlined, CheckCircleOutlined, CloseCircleOutlined + CodeOutlined, WarningOutlined, CheckCircleOutlined, CloseCircleOutlined } from '@ant-design/icons'; const { Title, Text } = Typography; @@ -19,7 +19,7 @@ const SUCCESS_COLOR = '#52c41a'; const ERROR_COLOR = '#f5222d'; const WARNING_COLOR = '#faad14'; const INFO_COLOR = '#1890ff'; -const COLORS = ['#0088FE', '#00C49F', '#FFBB28', '#FF8042', '#8884d8', '#4CAF50', '#F44336', '#9C27B0']; +// const COLORS = ['#0088FE', '#00C49F', '#FFBB28', '#FF8042', '#8884d8', '#4CAF50', '#F44336', '#9C27B0']; const TelemetryDashboard = () => { const [loading, setLoading] = useState(true); diff --git a/app/client/src/pages/DataGenerator/Configure.tsx b/app/client/src/pages/DataGenerator/Configure.tsx index 2db6fd4..cde2cab 100644 --- a/app/client/src/pages/DataGenerator/Configure.tsx +++ b/app/client/src/pages/DataGenerator/Configure.tsx @@ -1,5 +1,6 @@ import endsWith from 'lodash/endsWith'; import isEmpty from 'lodash/isEmpty'; +import isFunction from 'lodash/isFunction'; import { useEffect, useState } from 'react'; import { Flex, Form, Input, Select, Typography } from 'antd'; import styled from 'styled-components'; @@ -10,6 +11,7 @@ import { ModelProviders, ModelProvidersDropdownOpts } from './types'; import { useWizardCtx } from './utils'; import FileSelectorButton from './FileSelectorButton'; + const StepContainer = styled(Flex)` background: white; padding: 40px 0px; @@ -31,7 +33,8 @@ export 
const USECASE_OPTIONS = [ export const WORKFLOW_OPTIONS = [ { label: 'Supervised Fine-Tuning', value: 'supervised-fine-tuning' }, - { label: 'Custom Data Generation', value: 'custom' } + { label: 'Custom Data Generation', value: 'custom' }, + { label: 'Freeform Data Generation', value: 'freeform' } ]; export const MODEL_TYPE_OPTIONS: ModelProvidersDropdownOpts = [ @@ -55,16 +58,23 @@ const Configure = () => { delete values.output_value; const allFieldsFilled = Object.values(values).every(value => Boolean(value)); - if (allFieldsFilled) { - setIsStepValid && setIsStepValid(true) - } else { - setIsStepValid && setIsStepValid(false) + if (allFieldsFilled && isFunction(setIsStepValid)) { + setIsStepValid(true) + } else if (isFunction(setIsStepValid)) { + setIsStepValid(false) } } useEffect(() => { validateForm() }, [form, formData]) + + useEffect(() => { + if (formData && formData?.inference_type === undefined) { + form.setFieldValue('inference_type', ModelProviders.CAII); + } + }, [formData]); + const labelCol = { span: 8 }; @@ -83,7 +93,7 @@ const Configure = () => { form.setFieldValue('doc_paths', paths); } - const onFilesChange = (selections: any) => { + const onFilesChange = (selections: unknown) => { if (Array.isArray(selections) && !isEmpty(selections)) { const paths = selections.map((file: File) => ( { @@ -106,7 +116,6 @@ const Configure = () => { setSelectedFiles([]); } } - return ( @@ -209,7 +218,8 @@ const Configure = () => { )} - {formData?.workflow_type === WorkflowType.SUPERVISED_FINE_TUNING && + {(formData?.workflow_type === WorkflowType.SUPERVISED_FINE_TUNING || + formData?.workflow_type === WorkflowType.FREE_FORM_DATA_GENERATION) && { formData?.workflow_type === WorkflowType.CUSTOM_DATA_GENERATION) && { } + {/* {formData?.workflow_type === WorkflowType.FREE_FORM_DATA_GENERATION || + + + + + + } */} ) diff --git a/app/client/src/pages/DataGenerator/CustomPromptButton.tsx b/app/client/src/pages/DataGenerator/CustomPromptButton.tsx index e0696f1..dc2fa47 100644 --- a/app/client/src/pages/DataGenerator/CustomPromptButton.tsx +++ b/app/client/src/pages/DataGenerator/CustomPromptButton.tsx @@ -1,9 +1,8 @@ -import { Button, Flex, Form, Input, Modal, notification, Spin } from "antd"; +import { Button, Form, Input, Modal, notification } from "antd"; import { useEffect, useState } from "react"; import { useMutation } from "@tanstack/react-query"; import styled from "styled-components"; -import { LoadingOutlined } from '@ant-design/icons'; -import { fetchCustomPrompt, fetchPrompt } from "./hooks"; +import { fetchCustomPrompt } from "./hooks"; import Loading from "../Evaluator/Loading"; interface Props { @@ -16,9 +15,30 @@ interface Props { export const StyledTextArea = styled(Input.TextArea)` margin-bottom: 10px !important; - min-height: 175px !important; + min-height: 275px !important; + margin-bottom: 10px !important; + padding: 15px 20px !important; `; +const StyledModal = styled(Modal)` + .ant-modal-content { + max-height: 90vh; + // height: 760px; + height: 85vh; + width: 750px; + .ant-modal-body { + padding-top: 0; + min-height: 70vh; + } + } + // .ant-modal-content { + // border-radius: 8px; + // box-shadow: 0px 4px 16px rgba(0, 0, 0, 0.1); + // background-color: #ffffff; + // padding: 24px; + // } +` + const CustomPromptButton: React.FC = ({ model_id,
inference_type, caii_en setShowModal(false); } }, [mutation.error, mutation.isSuccess]); - + const onFinish = async () => { const custom_prompt = form.getFieldValue('custom_prompt_instructions'); try { @@ -67,7 +87,7 @@ const CustomPromptButton: React.FC = ({ model_id, inference_type, caii_en {showModal && ( - = ({ model_id, inference_type, caii_en initialValues={initialValues} onFinish={onSubmit} style={{ marginTop: '24px' }} - disabled={mutation.isLoading} + disabled={mutation.isPending} > - {mutation.isLoading && + {mutation.isPending && } @@ -90,6 +110,8 @@ const CustomPromptButton: React.FC = ({ model_id, inference_type, caii_en name='custom_prompt_instructions' label='Custom Prompt Instructions' rules={[{ required: true, message: "This field is required." }]} + labelCol={{ span: 24 }} + wrapperCol={{ span: 24 }} > = ({ model_id, inference_type, caii_en - + ) } diff --git a/app/client/src/pages/DataGenerator/DataGenerator.tsx b/app/client/src/pages/DataGenerator/DataGenerator.tsx index 62a545b..f4d6472 100644 --- a/app/client/src/pages/DataGenerator/DataGenerator.tsx +++ b/app/client/src/pages/DataGenerator/DataGenerator.tsx @@ -2,7 +2,7 @@ import isEmpty from 'lodash/isEmpty'; import isString from 'lodash/isString'; import { useRef, useState } from 'react'; import { useLocation } from 'react-router-dom'; -import { Button, Flex, Form, Layout, Steps, Typography } from 'antd'; +import { Button, Flex, Form, Layout, Steps } from 'antd'; import type { FormInstance } from 'antd'; import ArrowBackIcon from '@mui/icons-material/ArrowBack'; import ArrowForwardIcon from '@mui/icons-material/ArrowForward'; @@ -17,6 +17,7 @@ import Finish from './Finish'; import { DataGenWizardSteps, WizardStepConfig, WorkflowType } from './types'; import { WizardCtx } from './utils'; +import { useGetDatasetDetails } from '../DatasetDetails/hooks'; const { Content } = Layout; // const { Title } = Typography; @@ -98,10 +99,14 @@ const DataGenerator = () => { const [isStepValid, setIsStepValid] = useState(false); // Data passed from listing table to prepopulate form const location = useLocation(); - console.log('DatGenerator >> location?.state?.data:', location?.state?.data); + console.log('location?.state?.data:', location?.state?.data); const initialData = location?.state?.data; + + const datasetDetailsReq = location?.state?.data && useGetDatasetDetails(location?.state?.data?.generate_file_name) if (initialData?.technique) { - initialData.workflow_type = initialData?.technique === 'sft' ? WorkflowType.SUPERVISED_FINE_TUNING : + initialData.workflow_type = initialData?.technique === 'sft' ? + WorkflowType.SUPERVISED_FINE_TUNING : + initialData?.technique === 'freeform' ? 
WorkflowType.FREE_FORM_DATA_GENERATION : WorkflowType.CUSTOM_DATA_GENERATION; } if (Array.isArray(initialData?.doc_paths) && !isEmpty(initialData?.doc_paths) ) { @@ -111,6 +116,12 @@ const DataGenerator = () => { })); } + + // if (datasetDetailsReq && datasetDetailsReq.data && + // !isEmpty(datasetDetailsReq?.data?.generate_file_name)) { + // initialData.example_path = initialData?.example_path; + // } + if (Array.isArray(initialData?.input_paths) && !isEmpty(initialData?.input_paths) ) { initialData.doc_paths = initialData?.input_paths.map((path: string) => ({ value: path, diff --git a/app/client/src/pages/DataGenerator/Examples.tsx b/app/client/src/pages/DataGenerator/Examples.tsx index 2864ba6..b64fbb7 100644 --- a/app/client/src/pages/DataGenerator/Examples.tsx +++ b/app/client/src/pages/DataGenerator/Examples.tsx @@ -1,10 +1,21 @@ -import { Button, Form, Modal, Space, Table, Tooltip, Typography, Flex } from 'antd'; -import { DeleteOutlined, EditOutlined } from '@ant-design/icons'; +import first from 'lodash/first'; +import get from 'lodash/get'; +import isEmpty from 'lodash/isEmpty'; +import React, { useEffect } from 'react'; +import { Button, Form, Modal, Space, Table, Tooltip, Typography, Flex, Input, Empty } from 'antd'; +import { CloudUploadOutlined, DeleteOutlined, EditOutlined } from '@ant-design/icons'; import styled from 'styled-components'; +import { useMutation } from "@tanstack/react-query"; import { useFetchExamples } from '../../api/api'; import TooltipIcon from '../../components/TooltipIcon'; import PCModalContent from './PCModalContent'; -import { QuestionSolution } from './types'; +import { File, QuestionSolution, WorkflowType } from './types'; +import FileSelectorButton from './FileSelectorButton'; + +import { fetchFileContent } from './hooks'; +import { useState } from 'react'; +import FreeFormExampleTable from './FreeFormExampleTable'; +import { useWizardCtx } from './utils'; const { Title } = Typography; const Container = styled.div` @@ -25,20 +36,66 @@ const StyledTable = styled(Table)` cursor: pointer; } ` + +const StyledContainer = styled.div` + margin-bottom: 24px; + height: 48px; + color: rgba(0, 0, 0, 0.45); + svg { + font-size: 48px; + } + +`; + const MAX_EXAMPLES = 5; -const Examples = () => { +enum ExampleType { + FREE_FORM = 'freeform', + PROMPT_COMPLETION = 'promptcompletion' +} + +const Examples: React.FC = () => { const form = Form.useFormInstance(); - // const { setIsStepValid } = useWizardCtx(); - // const _values = Form.useWatch('examples', form); - // useEffect (() => { - // const values = form.getFieldsValue(); - // if (isEmpty(values.examples)) { - // setIsStepValid(false); - // } else if (!isEmpty(values?.examples)) { - // setIsStepValid(true); - // } - // }, [_values]); + const [exampleType, setExampleType] = useState(ExampleType.PROMPT_COMPLETION); + + const mutation = useMutation({ + mutationFn: fetchFileContent + }); + const values = form.getFieldsValue(true) + + useEffect(() => { + const example_path = form.getFieldValue('example_path'); + + if (!isEmpty(example_path)) { + mutation.mutate({ + path: example_path + }); + } + + if (form.getFieldValue('workflow_type') === 'freeform') { + setExampleType(ExampleType.FREE_FORM); + } + + + + }, [form.getFieldValue('example_path'), form.getFieldValue('workflow_type')]); + + useEffect(() => { + if (!isEmpty(mutation.data)) { + form.setFieldValue('examples', mutation.data); + } + }, [mutation.data]); + + const { setIsStepValid } = useWizardCtx(); + const _values = Form.useWatch(['examples', 
'example_path'], form); + useEffect (() => { + const values = form.getFieldsValue(); + if (isEmpty(values.examples) && isEmpty(form.getFieldValue('example_path'))) { + setIsStepValid(false); + } else { + setIsStepValid(true); + } + }, [_values, form.getFieldValue('example_path')]); const columns = [ { @@ -141,6 +198,26 @@ const Examples = () => { form.setFieldValue('examples', examples.examples) } const rowLimitReached = form.getFieldValue('examples')?.length === MAX_EXAMPLES; + const workflowType = form.getFieldValue('workflow_type'); + + const onAddFiles = (files: File[]) => { + if (!isEmpty (files)) { + const file = first(files); + mutation.mutate({ + path: get(file, '_path'), + }); + const values = form.getFieldsValue(); + form.setFieldsValue({ + ...values, + example_path: get(file, '_path') + }); + setExampleType(ExampleType.FREE_FORM); + } + } + + const labelCol = { + span: 10 + }; return ( @@ -151,7 +228,26 @@ const Examples = () => { - + + {workflowType === WorkflowType.FREE_FORM_DATA_GENERATION && + <> + + + + + + } + + {exampleType !== ExampleType.FREE_FORM && + } + + {exampleType !== ExampleType.FREE_FORM && - + } + {exampleType === ExampleType.FREE_FORM && !isEmpty(mutation.data) && + } + {exampleType === ExampleType.FREE_FORM && isEmpty(mutation.data) && !isEmpty(values.examples) && + } + {exampleType === ExampleType.FREE_FORM && isEmpty(mutation.data) && isEmpty(values.examples) && + + + + } + imageStyle={{ + height: 60, + marginBottom: 24 + }} + description={ + <> +

+ Upload a JSON file containing examples +

+

+ {'Examples should be provided as a JSON array of key/value objects: each key is a column name and each value is the corresponding cell value.'}

+ + } + > +
+ } + {exampleType !== ExampleType.FREE_FORM && @@ -230,7 +358,7 @@ const Examples = () => { rowClassName={() => 'hover-pointer'} rowKey={(_record, index) => `examples-table-${index}`} /> - + }
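For the freeform path above, the uploaded example file is expected to be a flat JSON array; a sketch of the shape (field names illustrative, not from the patch):

    // Keys become grid columns, values become cell values. This mirrors what
    // fetchFileContent returns and what form.setFieldValue('examples', ...) stores.
    const exampleRows: Record<string, unknown>[] = [
      { instruction: 'Summarize the note', response: 'A short summary...' },
      { instruction: 'Translate to French', response: 'Bonjour...' },
    ];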
) diff --git a/app/client/src/pages/DataGenerator/FileSelectorButton.tsx b/app/client/src/pages/DataGenerator/FileSelectorButton.tsx index fd612cf..b8e6f88 100644 --- a/app/client/src/pages/DataGenerator/FileSelectorButton.tsx +++ b/app/client/src/pages/DataGenerator/FileSelectorButton.tsx @@ -1,4 +1,3 @@ -import get from 'lodash/get'; import { Button, Modal } from 'antd'; import React, { useState } from 'react'; import FilesTable from './FilesTable'; @@ -9,9 +8,10 @@ import { File, WorkflowType } from './types'; interface Props { onAddFiles: (files: File[]) => void; workflowType: WorkflowType; + label?: string; } -const FileSelectorButton: React.FC = ({ onAddFiles, workflowType }) => { +const FileSelectorButton: React.FC = ({ onAddFiles, workflowType, label }) => { const [showModal, setShowModal] = useState(false); const [selectedFiles, setSelectedFiles] = useState([]) @@ -31,7 +31,9 @@ const FileSelectorButton: React.FC = ({ onAddFiles, workflowType }) => { style={{ marginLeft: '4px' }} onClick={() => setShowModal(true)} icon={} - /> + > + {label ? label : null} + {showModal && ( = ({ onSelectedRows, workflowType }) => { const [paths, setPaths] = useState(null); const [path, setPath] = useState(null); const [selectedRowKeys, setSelectedRowKeys] = useState([]); - const [selectedRows, setSelectedRows] = useState([]); + const [, setSelectedRows] = useState([]); // row selection map: path as key -> list of row keys const [rowSelectionMap, setRowSelectionMap] = useState({}); // row selection map: path as key -> list of files const [fileSelectionMap, setFileSelectionMap] = useState({}); - const { fetching, listProjectFiles, data } = useGetProjectFiles(paths || []); + const { fetching, listProjectFiles, data } = useGetProjectFiles(); useEffect(() => { if (!isEmpty(path) || paths === null || isEmpty(paths)) { @@ -151,7 +153,7 @@ const FilesTable: React.FC = ({ onSelectedRows, workflowType }) => { key: 'name', ellipsis: true, render: (file: File) => { - const { name, url } = file; + const { name } = file; if (file?.mime !== DIRECTORY_MIME_TYPE) { return ( @@ -205,6 +207,7 @@ const FilesTable: React.FC = ({ onSelectedRows, workflowType }) => { )} + {fetching && } diff --git a/app/client/src/pages/DataGenerator/Finish.tsx b/app/client/src/pages/DataGenerator/Finish.tsx index 00df57f..116bfb7 100644 --- a/app/client/src/pages/DataGenerator/Finish.tsx +++ b/app/client/src/pages/DataGenerator/Finish.tsx @@ -1,6 +1,7 @@ import isNumber from 'lodash/isNumber'; import filter from 'lodash/filter'; import isString from 'lodash/isString'; +import isEmpty from 'lodash/isEmpty'; import { FC, useEffect } from 'react'; import { HomeOutlined, PageviewOutlined } from '@mui/icons-material'; import AssessmentIcon from '@mui/icons-material/Assessment'; @@ -16,10 +17,10 @@ import { useTriggerDatagen } from './../../api/api' import { DEMO_MODE_THRESHOLD } from './constants' import { GenDatasetResponse, QuestionSolution, WorkflowType } from './types'; import { Pages } from '../../types'; -import { isEmpty } from 'lodash'; import CustomResultTable from './CustomResultTable'; import SeedResultTable from './SeedResultTable'; import { getFilesURL } from '../Evaluator/util'; +import FreeFormTable from './FreeFormTable'; const { Title } = Typography; @@ -126,9 +127,10 @@ const isDemoMode = (numQuestions: number, topics: [], form: FormInstance) => { const Finish = () => { const form = Form.useFormInstance(); - const { data: genDatasetResp, loading, error: generationError, triggerPost } = useTriggerDatagen(); - const { 
num_questions, topics } = form.getFieldsValue(true) - const isDemo = isDemoMode(num_questions, topics, form) + const { num_questions, topics, workflow_type } = form.getFieldsValue(true); + const { data: genDatasetResp, loading, error: generationError, triggerPost } = useTriggerDatagen(workflow_type); + + const isDemo = isDemoMode(num_questions, topics, form); useEffect(() => { const formValues = form.getFieldsValue(true); @@ -153,6 +155,8 @@ const Finish = () => { formValues.technique = 'sft'; } else if (formValues.workflow_type === WorkflowType.CUSTOM_DATA_GENERATION) { formValues.technique = 'custom_workflow'; + } else if (formValues.workflow_type === WorkflowType.FREE_FORM_DATA_GENERATION) { + formValues.technique = 'freeform'; } // send examples as null when the array is empty if (isEmpty(formValues.examples)) { @@ -173,11 +177,15 @@ const Finish = () => { formValues.doc_paths = doc_paths } + if (formValues.workflow_type === WorkflowType.FREE_FORM_DATA_GENERATION) { + delete formValues.examples; + } + const args = {...formValues, is_demo: isDemo, model_params: formValues.model_parameters } triggerPost(args) }, []); - const hasTopics = (genDatasetResp: any) => { + const hasTopics = (genDatasetResp: unknown) => { return !Array.isArray(genDatasetResp?.results) } @@ -192,13 +200,23 @@ const Finish = () => { let topicTabs = []; if (!hasDocSeeds && formValues.workflow_type !== WorkflowType.CUSTOM_DATA_GENERATION && - hasTopics(genDatasetResp)) { - topicTabs = genDatasetResp?.results && Object.keys(genDatasetResp.results).map((topic, i) => ({ + hasTopics(genDatasetResp) && !isEmpty(genDatasetResp?.results)) { + const values = Object.values(genDatasetResp?.results); + + + topicTabs = genDatasetResp?.results && Object.keys(genDatasetResp.results).map((topic, i) => { + return ({ key: `${topic}-${i}`, label: {topic}, value: topic, - children: - })); + children: workflow_type !== WorkflowType.FREE_FORM_DATA_GENERATION ? 
+ : + // + + + + }) + }); } const nextStepsListPreview = [ diff --git a/app/client/src/pages/DataGenerator/FreeFormExampleTable.tsx b/app/client/src/pages/DataGenerator/FreeFormExampleTable.tsx new file mode 100644 index 0000000..c93bceb --- /dev/null +++ b/app/client/src/pages/DataGenerator/FreeFormExampleTable.tsx @@ -0,0 +1,127 @@ + +import isEmpty from 'lodash/isEmpty'; +import first from 'lodash/first'; +import toString from 'lodash/toString'; +import React, { FunctionComponent, useState, useMemo, useCallback, useEffect } from 'react'; +import { AgGridReact } from 'ag-grid-react'; + +// // Register all Community features +// // ModuleRegistry.registerModules([AllCommunityModule]); +import { themeMaterial } from "ag-grid-community"; + +import { + ModuleRegistry, + ClientSideRowModelModule, + ValidationModule, + type ColDef, + type GetRowIdFunc, + type GetRowIdParams + } from 'ag-grid-community'; + +import { TextFilterModule } from 'ag-grid-community'; +import { NumberFilterModule } from 'ag-grid-community'; +import { DateFilterModule } from 'ag-grid-community'; + +// Register all Community features (if needed, specify valid modules here) +ModuleRegistry.registerModules([ + // AllModules, + TextFilterModule, + NumberFilterModule, + DateFilterModule, + // SetFilterModule, + // MultiFilterModule, + // GroupFilterModule, + // CustomFilterModule, + + // ModuleRegistry, + // RowGroupingModule, + // PivotModule, + // TreeDataModule, + ClientSideRowModelModule, + ValidationModule +]); + +interface Props { + data: Record[]; +} + +const FreeFormExampleTable: FunctionComponent = ({ data }) => { + const [colDefs, setColDefs] = useState([]); + const [rowData, setRowData] = useState([]); + + useEffect(() => { + if (!isEmpty(data)) { + const columnNames = Object.keys(first(data)); + const columnDefs = columnNames.map((colName) => ({ + field: colName, + headerName: colName, + width: 250, + filter: true, + sortable: true, + resizable: true + })); + setColDefs(columnDefs); + setRowData(data); + } + } + , [data]); + + const defaultColDef: ColDef = useMemo( + () => ({ + flex: 1, + filter: true, + enableRowGroup: true, + enableValue: true, + + editable: true, + minWidth: 170 + }), + [] + ); + + let index = 0; + const getRowId = useCallback( + ({ data: { ticker } }: GetRowIdParams) => { + index++; + return ticker || toString(index); + }, + [] + ); + + const statusBar = useMemo( + () => ({ + statusPanels: [ + { statusPanel: "agTotalAndFilteredRowCountComponent" }, + { statusPanel: "agTotalRowCountComponent" }, + { statusPanel: "agFilteredRowCountComponent" }, + { statusPanel: "agSelectedRowCountComponent" }, + { statusPanel: "agAggregationComponent" }, + ], + }), + [] + ); + + + return ( + <> +
+ +
+ + ); +} +export default FreeFormExampleTable; \ No newline at end of file diff --git a/app/client/src/pages/DataGenerator/FreeFormTable.tsx b/app/client/src/pages/DataGenerator/FreeFormTable.tsx new file mode 100644 index 0000000..24fb9f8 --- /dev/null +++ b/app/client/src/pages/DataGenerator/FreeFormTable.tsx @@ -0,0 +1,139 @@ + +import isEmpty from 'lodash/isEmpty'; +import first from 'lodash/first'; +import toString from 'lodash/toString'; +import React, { FunctionComponent, useState, useMemo, useCallback, useEffect } from 'react'; +import { AgGridReact } from 'ag-grid-react'; +// // Register all Community features +// // ModuleRegistry.registerModules([AllCommunityModule]); +import { themeMaterial } from 'ag-grid-community'; + +import { + ModuleRegistry, + ClientSideRowModelModule, + ValidationModule, + type ColDef, + type GetRowIdFunc, + type GetRowIdParams + } from 'ag-grid-community'; + +// import { RowGroupingModule } from 'ag-grid-community'; +// import { PivotModule } from 'ag-grid-community'; +// import { TreeDataModule } from 'ag-grid-community'; +// import { ClientSideRowModelModule } from 'ag-grid-community'; +// import { AllModules } from 'ag-grid-community'; +import { TextFilterModule } from 'ag-grid-community'; +import { NumberFilterModule } from 'ag-grid-community'; +import { DateFilterModule } from 'ag-grid-community'; +// import { SetFilterModule } from 'ag-grid-community'; +// import { MultiFilterModule } from 'ag-grid-community'; +// import { GroupFilterModule } from 'ag-grid-community'; +// import { CustomFilterModule } from 'ag-grid-community'; + +// Register all Community features (if needed, specify valid modules here) +ModuleRegistry.registerModules([ + // AllModules, + TextFilterModule, + NumberFilterModule, + DateFilterModule, + ClientSideRowModelModule, + ValidationModule +]); + +interface Props { + data: Record[]; +} + +const FreeFormTable: FunctionComponent = ({ data }) => { + const [colDefs, setColDefs] = useState([]); + const [rowData, setRowData] = useState([]); + + useEffect(() => { + if (!isEmpty(data)) { + const columnNames = Object.keys(first(data)); + const columnDefs = columnNames.map((colName) => ({ + field: colName, + headerName: colName, + width: 250, + filter: true, + sortable: true, + resizable: true + })); + setColDefs(columnDefs); + setRowData(data); + } + } + , [data]); + // const [rowData, setRowData] = useState([ + // { make: "Tesla", model: "Model Y", price: 64950, electric: true }, + // { make: "Ford", model: "F-Series", price: 33850, electric: false }, + // { make: "Toyota", model: "Corolla", price: 29600, electric: false }, + // ]); + + // // Column Definitions: Defines the columns to be displayed. + // const [colDefs, setColDefs] = useState([ + // { field: "make" }, + // { field: "model" }, + // { field: "price" }, + // { field: "electric" } + // ]); + + const defaultColDef: ColDef = useMemo( + () => ({ + flex: 1, + filter: true, + enableRowGroup: true, + enableValue: true, + + editable: true, + minWidth: 170 + }), + [] + ); + + let index = 0; + const getRowId = useCallback( + ({ data: { ticker } }: GetRowIdParams) => { + index++; + return ticker || toString(index); + }, + [] + ); + + const statusBar = useMemo( + () => ({ + statusPanels: [ + { statusPanel: "agTotalAndFilteredRowCountComponent" }, + { statusPanel: "agTotalRowCountComponent" }, + { statusPanel: "agFilteredRowCountComponent" }, + { statusPanel: "agSelectedRowCountComponent" }, + { statusPanel: "agAggregationComponent" }, + ], + }), + [] + ); + + + return ( + <> +
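One caveat in both grid components: getRowId increments a closure counter on every call and falls back to a ticker field this data does not have, so AG Grid can receive a different id for the same row across invocations. A deterministic alternative would stamp ids once (a sketch, not the committed code):

    import type { GetRowIdParams } from 'ag-grid-community';

    // Assign each row a stable id up front, then return it from getRowId.
    const withIds = (rows: Record<string, unknown>[]) =>
      rows.map((row, i) => ({ ...row, __rowId: String(i) }));

    const getRowId = ({ data }: GetRowIdParams<Record<string, unknown>>) =>
      data.__rowId as string;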
+ +
+ + ); +} +export default FreeFormTable; \ No newline at end of file diff --git a/app/client/src/pages/DataGenerator/PCModalContent.tsx b/app/client/src/pages/DataGenerator/PCModalContent.tsx index b157ed1..83b7c0e 100644 --- a/app/client/src/pages/DataGenerator/PCModalContent.tsx +++ b/app/client/src/pages/DataGenerator/PCModalContent.tsx @@ -3,7 +3,7 @@ import styled from 'styled-components'; import Markdown from '../../components/Markdown'; import TooltipIcon from '../../components/TooltipIcon'; -import { JustificationScore, QuestionSolution } from './types'; +import { QuestionSolution } from './types'; const { Title } = Typography; const Container = styled(Flex)` diff --git a/app/client/src/pages/DataGenerator/Parameters.tsx b/app/client/src/pages/DataGenerator/Parameters.tsx index 7f1564e..9855568 100644 --- a/app/client/src/pages/DataGenerator/Parameters.tsx +++ b/app/client/src/pages/DataGenerator/Parameters.tsx @@ -1,6 +1,6 @@ import isEmpty from 'lodash/isEmpty'; import { useEffect, useRef, useState } from 'react'; -import { Col, Divider, Form, InputNumber, Row, Slider, Spin, Typography } from 'antd'; +import { Col, Divider, Form, InputNumber, Row, Slider, Typography } from 'antd'; import { merge } from 'lodash'; import styled from 'styled-components'; @@ -58,7 +58,7 @@ const Parameters = () => { const formData = form.getFieldsValue(true); const [values, setValues] = useState(formData?.model_parameters); - const { data: defaultParams, loading: loadingDefaultParams } = useFetchDefaultModelParams(); + const { data: defaultParams } = useFetchDefaultModelParams(); useEffect(() => { if (!isEmpty(formData?.model_parameters)) { @@ -185,7 +185,7 @@ const Parameters = () => { - {/* {LABELS[ModelParameters.MAX_TOKENS]}} labelCol={{ span: 24 }} @@ -215,7 +215,7 @@ const Parameters = () => { /> - */} + ) diff --git a/app/client/src/pages/DataGenerator/Prompt.tsx b/app/client/src/pages/DataGenerator/Prompt.tsx index 71c654d..8ff5ab6 100644 --- a/app/client/src/pages/DataGenerator/Prompt.tsx +++ b/app/client/src/pages/DataGenerator/Prompt.tsx @@ -77,12 +77,12 @@ const Prompt = () => { const output_key = form.getFieldValue('output_key'); const caii_endpoint = form.getFieldValue('caii_endpoint'); - const { data: defaultPrompt, loading: promptsLoading } = useFetchDefaultPrompt(useCase); + const { data: defaultPrompt, loading: promptsLoading } = useFetchDefaultPrompt(useCase, workflow_type); // Page Bootstrap requests and useEffect const { data: defaultTopics, loading: topicsLoading } = usefetchTopics(useCase); const { data: defaultSchema, loading: schemaLoading } = useFetchDefaultSchema(); - const { data: dataset_size, isLoading: datasetSizeLoadin, isError, error } = useDatasetSize( + const { data: dataset_size, isLoading: datasetSizeLoading, isError, error } = useDatasetSize( workflow_type, doc_paths, input_key, @@ -266,7 +266,8 @@ const Prompt = () => { } {isEmpty(doc_paths) && (workflow_type === WorkflowType.SUPERVISED_FINE_TUNING || - workflow_type === WorkflowType.CUSTOM_DATA_GENERATION) && + workflow_type === WorkflowType.CUSTOM_DATA_GENERATION || + workflow_type === WorkflowType.FREE_FORM_DATA_GENERATION) && { { + { validator: (_: unknown, value: string) => { if (items.includes(value)) { return Promise.reject('This seed instruction already exists in the list') } diff --git a/app/client/src/pages/DataGenerator/SeedResultTable.tsx b/app/client/src/pages/DataGenerator/SeedResultTable.tsx index 159eb07..ae416a3 100644 --- a/app/client/src/pages/DataGenerator/SeedResultTable.tsx +++ 
b/app/client/src/pages/DataGenerator/SeedResultTable.tsx @@ -24,7 +24,7 @@ const SeedResultTable: React.FC = ({ results }) => { forEach(seeds, (seed: string) => { const pairs = get(results, `${seed}`); if (Array.isArray(pairs)) { - forEach(pairs, (pair: any) => { + forEach(pairs, (pair: unknown) => { data.push({ seed, question: get(pair, `question`), diff --git a/app/client/src/pages/DataGenerator/Summary.tsx b/app/client/src/pages/DataGenerator/Summary.tsx index 797f110..5aeff01 100644 --- a/app/client/src/pages/DataGenerator/Summary.tsx +++ b/app/client/src/pages/DataGenerator/Summary.tsx @@ -6,6 +6,7 @@ import PCModalContent from './PCModalContent' import { MODEL_PROVIDER_LABELS } from './constants' import { ModelParameters } from '../../types'; import { ModelProviders, QuestionSolution, Usecases } from './types'; +import FreeFormExampleTable from './FreeFormExampleTable'; const { Title } = Typography; const MODEL_PARAMETER_LABELS: Record = { @@ -46,10 +47,11 @@ const Summary= () => { num_questions, custom_prompt, model_parameters, + workflow_type, topics = [], schema, examples = [] - } = form.getFieldsValue(true) + } = form.getFieldsValue(true); const cfgStepDataSource = [ { label: 'Dataset Name', children: display_name }, @@ -72,7 +74,7 @@ const Summary= () => { ellipsis: true, render: (_text: QuestionSolution, record: QuestionSolution) => <>{record.solution} }, - ] + ]; return ( @@ -133,9 +135,11 @@ const Summary= () => { )} - {isEmpty(examples) && + {!isEmpty(examples) &&
{'Examples'} + {workflow_type === 'freeform' ? + : { }) })} rowKey={(_record, index) => `summary-examples-table-${index}`} - /> + />}
}
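The hooks change below wires the client to the new /json/get_content route; stripped to its essentials, fetchFileContent is a plain POST that unwraps the data field of the response (path value and error handling illustrative):

    async function loadExampleContent(path: string): Promise<Record<string, unknown>[]> {
      const resp = await fetch(`${import.meta.env.VITE_AMP_URL}/json/get_content`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ path }),
      });
      const body = await resp.json();
      if (!resp.ok) {
        // the endpoint reports failures as { status: 'failed', error: '...' }
        throw new Error(body.error ?? 'Failed to fetch file content');
      }
      return body.data; // an array of row objects whose keys become grid columns
    }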
) diff --git a/app/client/src/pages/DataGenerator/constants.ts b/app/client/src/pages/DataGenerator/constants.ts index 4e5549a..d90b5b2 100644 --- a/app/client/src/pages/DataGenerator/constants.ts +++ b/app/client/src/pages/DataGenerator/constants.ts @@ -19,7 +19,8 @@ export const USECASE_OPTIONS = [ export const WORKFLOW_OPTIONS = [ { label: 'Supervised Fine-Tuning', value: 'sft' }, - { label: 'Custom Data Generation', value: 'custom' } + { label: 'Custom Data Generation', value: 'custom' }, + { label: 'Freeform Data Generation', value: 'freeform' } ]; export const MODEL_TYPE_OPTIONS: ModelProvidersDropdownOpts = [ diff --git a/app/client/src/pages/DataGenerator/hooks.ts b/app/client/src/pages/DataGenerator/hooks.ts index 61bb2a5..eab4e99 100644 --- a/app/client/src/pages/DataGenerator/hooks.ts +++ b/app/client/src/pages/DataGenerator/hooks.ts @@ -3,13 +3,12 @@ import get from 'lodash/get'; import toNumber from 'lodash/toNumber'; import isEmpty from 'lodash/isEmpty'; import isString from 'lodash/isString'; -import { useState } from 'react'; import { useMutation, useQuery } from '@tanstack/react-query'; import { WorkflowType } from './types'; const BASE_API_URL = import.meta.env.VITE_AMP_URL; -export const fetchPrompt = async (use_case: string, params: any) => { +export const fetchPrompt = async (use_case: string, params: unknown) => { if (use_case !== 'custom') { const resp = await fetch(`${BASE_API_URL}/${use_case}/gen_prompt`, { method: 'GET' @@ -63,7 +62,7 @@ export const useGetPromptByUseCase = (use_case: string, { model_id, inference_ty }; } -export const fetchCustomPrompt = async (params: any) => { +export const fetchCustomPrompt = async (params: unknown) => { if (params.use_case !== 'custom') { const resp = await fetch(`${BASE_API_URL}/${params.use_case}/gen_prompt`, { method: 'GET' @@ -89,7 +88,24 @@ export const fetchCustomPrompt = async (params: any) => { } } -export const listModels = async (params: any) => { +export const fetchFileContent = async (params: unknown) => { + const resp = await fetch(`${BASE_API_URL}/json/get_content`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify(params), + }); + if (resp.status !== 200) { + const error = await resp.json(); + throw new Error(error.message || error.detail); + } + const body = await resp.json(); + const content = get(body, 'data'); + return content; +} + +export const listModels = async (params: unknown) => { const resp = await fetch(`${BASE_API_URL}/model/model_ID`, { method: 'POST', headers: { @@ -105,7 +121,7 @@ export const listModels = async (params: any) => { return body; } -export const listFilesByPath = async (params: any) => { +export const listFilesByPath = async (params: unknown) => { const resp = await fetch(`${BASE_API_URL}/get_project_files`, { method: 'POST', headers: { @@ -119,7 +135,7 @@ export const listFilesByPath = async (params: any) => { } const body = await resp.json(); const _files = get(body, '_files'); - const files = _files.map((_file: any) => { + const files = _files.map((_file: unknown) => { const name = get(_file, '_path'); const size = toNumber(get(_file, '_file_size')); const _is_dir = get(_file, '_is_dir') @@ -135,9 +151,7 @@ export const listFilesByPath = async (params: any) => { return files; } -export const useGetProjectFiles = (paths: string[]) => { - const [files, setFiles] = useState([]); - +export const useGetProjectFiles = () => { const mutation = useMutation({ mutationFn: listFilesByPath }); @@ -145,12 +159,12 @@ export const 
useGetProjectFiles = (paths: string[]) => { if (mutation.isError) { notification.error({ message: 'Error', - description: `An error occurred while fetching the prompt.\n ${mutation.error}` + description: `An error occurred while fetching the list of project files.\n ${mutation.error}` }); } return { listProjectFiles: mutation.mutate, - fetching: mutation.isLoading, + fetching: mutation.isPending, error: mutation.error, isError: mutation.isError, data: mutation.data @@ -158,7 +172,7 @@ export const useGetProjectFiles = (paths: string[]) => { }; - export const fetchDatasetSize = async (params: any) => { + export const fetchDatasetSize = async (params: unknown) => { const resp = await fetch(`${BASE_API_URL}/json/dataset_size`, { method: 'POST', headers: { diff --git a/app/client/src/pages/DataGenerator/types.ts b/app/client/src/pages/DataGenerator/types.ts index 38a8f25..00be477 100644 --- a/app/client/src/pages/DataGenerator/types.ts +++ b/app/client/src/pages/DataGenerator/types.ts @@ -35,7 +35,7 @@ export interface GenDatasetRequest { topics?: string[]; use_case?: Usecases is_demo?: boolean; - results?: any + results?: unknown } export interface GenDatasetResponse { @@ -104,7 +104,8 @@ export interface File { export enum WorkflowType { SUPERVISED_FINE_TUNING = 'supervised-fine-tuning', - CUSTOM_DATA_GENERATION = "custom" + CUSTOM_DATA_GENERATION = "custom", + FREE_FORM_DATA_GENERATION = "freeform" } export interface CustomResult { @@ -114,5 +115,6 @@ export interface CustomResult { export enum TechniqueType { SFT = 'sft', - CUSTOME_WORKFLOW = 'custom_workflow' + CUSTOME_WORKFLOW = 'custom_workflow', + FREE_FORM = 'freeform' } \ No newline at end of file diff --git a/app/client/src/pages/DataGenerator/utils.ts b/app/client/src/pages/DataGenerator/utils.ts index ab9f784..6f7e932 100644 --- a/app/client/src/pages/DataGenerator/utils.ts +++ b/app/client/src/pages/DataGenerator/utils.ts @@ -37,3 +37,35 @@ export const fromNow = time => { } return moment(time).fromNow(); }; + +export const sampleExamplesData = [ + { + "loan_amnt": 10000.00, + "term": "36 months", + "int_rate": 11.44, + "installment": 329.48, + "grade": "B", + "sub_grade": "B4", + "emp_title": "Marketing", + "emp_length": "10+ years", + "home_ownership": "RENT", + "annual_inc": 117000.00, + "verification_status": "Not Verified", + "issue_d": "Jan-2015", + "loan_status": "Fully Paid", + "purpose": "vacation", + "title": "Vacation", + "dti": 26.24, + "earliest_cr_line": "Jun-1990", + "open_acc": 16.00, + "pub_rec": 0.00, + "revol_bal": 36369.00, + "revol_util": 41.80, + "total_acc": 25.00, + "initial_list_status": "w", + "application_type": "INDIVIDUAL", + "mort_acc": 0.00, + "pub_rec_bankruptcies": 0.00, + "address": "0185 Michelle Gateway\r\nMendozaberg, OK 22690" + } +]; diff --git a/app/client/src/pages/DatasetDetails/ConfigurationTab.tsx b/app/client/src/pages/DatasetDetails/ConfigurationTab.tsx index 138adbb..b16ec0c 100644 --- a/app/client/src/pages/DatasetDetails/ConfigurationTab.tsx +++ b/app/client/src/pages/DatasetDetails/ConfigurationTab.tsx @@ -6,6 +6,7 @@ import { Col, Flex, Modal, Row, Space, Table, Tag, Typography } from 'antd'; import ExampleModal from './ExampleModal'; import { QuestionSolution } from '../DataGenerator/types'; import styled from 'styled-components'; +import FreeFormExampleTable from '../DataGenerator/FreeFormExampleTable'; const { Text } = Typography; @@ -149,6 +150,8 @@ const ConfigurationTab: React.FC = ({ dataset }) => { Examples + {dataset.technique === 'freeform' && } + {dataset.technique !== 
'freeform' && = ({ dataset }) => { }) })} rowKey={(_record, index) => `summary-examples-table-${index}`} - /> + />} diff --git a/app/client/src/pages/DatasetDetails/CustomGenerationTable.tsx b/app/client/src/pages/DatasetDetails/CustomGenerationTable.tsx index 1e2ec19..776276f 100644 --- a/app/client/src/pages/DatasetDetails/CustomGenerationTable.tsx +++ b/app/client/src/pages/DatasetDetails/CustomGenerationTable.tsx @@ -1,7 +1,5 @@ import React, { SyntheticEvent, useEffect } from 'react'; - import { Col, Input, Row, Table } from 'antd'; -import { CustomResult } from '../DataGenerator/types'; import { DatasetGeneration } from '../Home/types'; import { sortItemsByKey } from '../../utils/sortutils'; import { SearchProps } from 'antd/es/input/Search'; @@ -51,7 +49,7 @@ const CustomGenerationTable: React.FC = ({ results }) => { } ]; - const onSearch: SearchProps['onSearch'] = (value, _e, info) => { + const onSearch: SearchProps['onSearch'] = (value) => { throttle((value: string) => setSearchQuery(value), 500)(value); } diff --git a/app/client/src/pages/DatasetDetails/DatasetDetailsPage.tsx b/app/client/src/pages/DatasetDetails/DatasetDetailsPage.tsx index 3ef5175..b5de8a0 100644 --- a/app/client/src/pages/DatasetDetails/DatasetDetailsPage.tsx +++ b/app/client/src/pages/DatasetDetails/DatasetDetailsPage.tsx @@ -24,21 +24,6 @@ import { getFilesURL } from '../Evaluator/util'; const { Content } = Layout; const { Title } = Typography; - -const StyledHeader = styled.div` - height: 28px; - flex-grow: 0; - font-family: Roboto, -apple-system, 'Segoe UI', sans-serif; - color: #5a656d; - font-size: 24px; - font-weight: 300; - font-stretch: normal; - font-style: normal; - line-height: 1.4; - letter-spacing: normal; - text-align: left; -`; - const StyledLabel = styled.div` margin-bottom: 4px; font-family: Roboto, -apple-system, 'Segoe UI', sans-serif; @@ -221,7 +206,7 @@ const DatasetDetailsPage: React.FC = () => { - Files + Context {/* {dataset?.custom_prompt} */} diff --git a/app/client/src/pages/DatasetDetails/DatasetGenerationTab.tsx b/app/client/src/pages/DatasetDetails/DatasetGenerationTab.tsx index 9341cf2..5e4419f 100644 --- a/app/client/src/pages/DatasetDetails/DatasetGenerationTab.tsx +++ b/app/client/src/pages/DatasetDetails/DatasetGenerationTab.tsx @@ -5,8 +5,8 @@ import styled from 'styled-components'; import { Dataset } from '../Evaluator/types'; import CustomGenerationTable from './CustomGenerationTable'; import DatasetGenerationTopics from './DatasetGenerationTopics'; -import { CustomResult } from "../DataGenerator/types"; import { DatasetDetails, DatasetGeneration } from '../Home/types'; +import DatasetViewer from './DatasetViewer'; @@ -23,19 +23,17 @@ const Container = styled.div` const DatasetGenerationTab: React.FC = ({ dataset, datasetDetails }) => { - console.log(`DatasetGenerationTab > dataset`, dataset); - console.log(` datasetDetails`, datasetDetails); const topics = get(dataset, 'topics', []); - console.log(` topics`, topics); + const technique = get(dataset, 'technique'); const hasCustomSeeds = !Array.isArray(datasetDetails?.generation) || isEmpty(topics) || topics !== null; - console.log(` hasCustomSeeds`, hasCustomSeeds); return ( - {hasCustomSeeds && } - {!hasCustomSeeds && } + {technique === 'freeform' && } + {(technique !== 'freeform' && hasCustomSeeds) && } + {(technique !== 'freeform' && !hasCustomSeeds) && } diff --git a/app/client/src/pages/DatasetDetails/DatasetGenerationTopics.tsx b/app/client/src/pages/DatasetDetails/DatasetGenerationTopics.tsx index 3d5d529..c74bed7 
100644 --- a/app/client/src/pages/DatasetDetails/DatasetGenerationTopics.tsx +++ b/app/client/src/pages/DatasetDetails/DatasetGenerationTopics.tsx @@ -1,44 +1,17 @@ import get from 'lodash/get'; -import { Card, Table, Tabs, Typography } from "antd"; +import { Card, Tabs, Typography } from "antd"; import { DatasetGeneration } from "../Home/types"; import TopicGenerationTable from "./TopicGenerationTable"; import isEmpty from "lodash/isEmpty"; import styled from "styled-components"; import { Dataset } from '../Evaluator/types'; +import FreeFormTable from '../DataGenerator/FreeFormTable'; interface Props { data: DatasetGeneration; dataset: Dataset; } -const StyledTable = styled(Table)` - font-family: Roboto, -apple-system, 'Segoe UI', sans-serif; - color: #5a656d; - .ant-table-thead > tr > th { - color: #5a656d; - border-bottom: 1px solid #eaebec; - font-weight: 500; - text-align: left; - // background: #ffffff; - border-bottom: 1px solid #eaebec; - transition: background 0.3s ease; - } - .ant-table-row { - cursor: pointer; - } - .ant-table-row > td.ant-table-cell { - padding: 8px; - padding-left: 16px; - font-size: 13px; - font-family: Roboto, -apple-system, 'Segoe UI', sans-serif; - color: #5a656d; - .ant-typography { - font-size: 13px; - font-family: Roboto, -apple-system, 'Segoe UI', sans-serif; - } - } -`; - const TabsContainer = styled(Card)` .ant-card-body { padding: 0; } @@ -59,6 +32,7 @@ const getTopicTree = (data: DatasetGeneration, topics: string[]) => { const DatasetGenerationTable: React.FC = ({ data, dataset }) => { const topics = get(dataset, 'topics', []); + const technique = get(dataset, 'technique'); const topicTree = getTopicTree(data, topics); let topicTabs = []; @@ -67,7 +41,9 @@ const DatasetGenerationTable: React.FC = ({ data, dataset }) => { key: `${topic}-${i}`, label: {topic}, value: topic, - children: + children: technique !== 'freeform' ? + : + })); } diff --git a/app/client/src/pages/DatasetDetails/DatasetViewer.tsx b/app/client/src/pages/DatasetDetails/DatasetViewer.tsx new file mode 100644 index 0000000..7db1530 --- /dev/null +++ b/app/client/src/pages/DatasetDetails/DatasetViewer.tsx @@ -0,0 +1,44 @@ +import { FunctionComponent, useEffect } from "react"; +import { Dataset } from '../Evaluator/types'; +import { useMutation } from "@tanstack/react-query"; +import { fetchFileContent } from "../DataGenerator/hooks"; +import get from "lodash/get"; +import isEmpty from "lodash/isEmpty"; +import { Col, Row } from "antd"; +import FreeFormTable from "../DataGenerator/FreeFormTable"; + +interface Props { + dataset: Dataset; +} + + +const DatasetViewer: FunctionComponent = ({ dataset }) => { + const mutation = useMutation({ + mutationFn: fetchFileContent + }); + + useEffect(() => { + const generate_file_name = get(dataset, 'generate_file_name'); + if (!isEmpty(generate_file_name)) { + mutation.mutate({ + path: generate_file_name + }); + } + }, [dataset]); + + + return ( + +
+ {mutation.isPending &&

Loading...

} + {mutation.isError &&

Error: {String(mutation.error)}

} + {mutation.isSuccess && ( + + )} +
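In sketch form, the success branch hands the fetched rows to the free-form grid (props and layout illustrative; FreeFormTable and the antd Row/Col are the component's actual imports):

    import { Col, Row } from 'antd';
    import FreeFormTable from '../DataGenerator/FreeFormTable';

    // Roughly what DatasetViewer renders once the content mutation resolves.
    const ViewerBody = ({ rows }: { rows: Record<string, unknown>[] }) => (
      <Row>
        <Col span={24}>
          <FreeFormTable data={rows} />
        </Col>
      </Row>
    );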
+ +
+  );
+}
+export default DatasetViewer;
\ No newline at end of file
diff --git a/app/client/src/pages/DatasetDetails/ExampleModal.tsx b/app/client/src/pages/DatasetDetails/ExampleModal.tsx
index 8443537..df3123a 100644
--- a/app/client/src/pages/DatasetDetails/ExampleModal.tsx
+++ b/app/client/src/pages/DatasetDetails/ExampleModal.tsx
@@ -1,4 +1,4 @@
-import { Flex, Form, Typography } from 'antd';
+import { Flex, Typography } from 'antd';
 import styled from 'styled-components';
 
 import Markdown from '../../components/Markdown';
diff --git a/app/client/src/pages/DatasetDetails/ExamplesSection.tsx b/app/client/src/pages/DatasetDetails/ExamplesSection.tsx
index aaf5d52..9d1eb02 100644
--- a/app/client/src/pages/DatasetDetails/ExamplesSection.tsx
+++ b/app/client/src/pages/DatasetDetails/ExamplesSection.tsx
@@ -1,15 +1,12 @@
-import { Collapse, Descriptions, Flex, Modal, Table, Typography } from "antd";
+import { Collapse, Flex, Modal, Table } from "antd";
 import styled from "styled-components";
-import Markdown from "../../Markdown";
 import { DatasetResponse } from "../../../api/Datasets/response";
 import { QuestionSolution } from "../../../pages/DataGenerator/types";
-import { MODEL_PARAMETER_LABELS, ModelParameters, Usecases } from "../../../types";
 import { Dataset } from "../../../pages/Evaluator/types";
-import PCModalContent from "../../../pages/DataGenerator/PCModalContent";
 import ExampleModal from "./ExampleModal";
+import FreeFormExampleTable from "../DataGenerator/FreeFormExampleTable";
 
-const { Text, Title } = Typography;
 const Panel = Collapse.Panel;
 
@@ -41,16 +38,7 @@ const StyledTable = styled(Table)`
   }
 `;
 
-const MarkdownWrapper = styled.div`
-  border: 1px solid #d9d9d9;
-  border-radius: 6px;
-  padding: 4px 11px;
-`;
-
-const StyledLabel = styled.div`
-  font-size: 16px;
-  padding-top: 8px;
-`;
 
 const StyledCollapse = styled(Collapse)`
   .ant-collapse-content > .ant-collapse-content-box {
@@ -74,6 +62,7 @@ export type DatasetDetailProps = {
 }
 
 const ExamplesSection= ({ datasetDetails }: DatasetDetailProps) => {
+  const { technique } = datasetDetails;
 
   const exampleCols = [
     {
@@ -99,6 +88,11 @@ const ExamplesSection= ({ datasetDetails }: DatasetDetailProps) => {
           style={{ padding: 0 }}
         >
+          {technique === 'freeform' ? (
+            <FreeFormExampleTable />
+          ) : <StyledTable
             })
           })}
           rowKey={(_record, index) => `summary-examples-table-${index}`}
-          />
-
-        {/* Model Parameters
-          ({
-            label: MODEL_PARAMETER_LABELS[modelParameterKey as ModelParameters],
-            children: datasetDetails.model_parameters[modelParameterKey as ModelParameters],
-          }))
-          : []}>
-
-        {(datasetDetails.schema && datasetDetails.use_case === Usecases.TEXT2SQL) && (
-          {'DB Schema'}
-
-
-
-        )} */}
-
+          />}
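Note: DatasetViewer above depends on `fetchFileContent` from `../DataGenerator/hooks`, which this patch series does not include. The following is a minimal sketch of what that mutation function plausibly looks like; the endpoint constant, request body, and the `data` envelope on the response are assumptions, not confirmed by the patch:

```ts
// Hypothetical sketch only: the real fetchFileContent lives in
// app/client/src/pages/DataGenerator/hooks and is not shown in this patch.
declare const BASE_API_URL: string; // provided elsewhere in the client

// Assumed backend route that returns the parsed JSON of a generated dataset file.
const JSON_CONTENT_ENDPOINT = `${BASE_API_URL}/json/get_content`;

export const fetchFileContent = async ({ path }: { path: string }): Promise<unknown> => {
  const response = await fetch(JSON_CONTENT_ENDPOINT, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ path }),
  });
  if (!response.ok) {
    // Propagate failures so useMutation flips to isError.
    throw new Error(`Failed to load file content (HTTP ${response.status})`);
  }
  const json = await response.json();
  // Assumes the server wraps the parsed file content under a `data` key.
  return json.data;
};
```

Wired into react-query via `useMutation({ mutationFn: fetchFileContent })`, a successful call leaves the parsed rows on `mutation.data` for `FreeFormTable` to render.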
diff --git a/app/client/src/pages/DatasetDetails/TopicGenerationTable.tsx b/app/client/src/pages/DatasetDetails/TopicGenerationTable.tsx
index 3626f55..a9c2fc7 100644
--- a/app/client/src/pages/DatasetDetails/TopicGenerationTable.tsx
+++ b/app/client/src/pages/DatasetDetails/TopicGenerationTable.tsx
@@ -1,7 +1,5 @@
 import React, { SyntheticEvent, useEffect } from 'react';
-
 import { Col, Input, Row, Table } from 'antd';
-import { CustomResult } from '../DataGenerator/types';
 import { DatasetGeneration } from '../Home/types';
 import throttle from 'lodash/throttle';
 import { SearchProps } from 'antd/es/input';
@@ -52,7 +50,7 @@ const TopicGenerationTable: React.FC = ({ results }) => {
     }
   ];
 
-  const onSearch: SearchProps['onSearch'] = (value, _e, info) => {
+  const onSearch: SearchProps['onSearch'] = (value) => {
     throttle((value: string) => setSearchQuery(value), 500)(value);
   }
 
diff --git a/app/client/src/pages/DatasetDetails/constants.tsx b/app/client/src/pages/DatasetDetails/constants.tsx
index 4faa3b4..4446138 100644
--- a/app/client/src/pages/DatasetDetails/constants.tsx
+++ b/app/client/src/pages/DatasetDetails/constants.tsx
@@ -1,6 +1,4 @@
-import { HomeOutlined, PageviewOutlined } from '@mui/icons-material';
 import AssessmentIcon from '@mui/icons-material/Assessment';
-import CheckCircleIcon from '@mui/icons-material/CheckCircle'
 import GradingIcon from '@mui/icons-material/Grading';
 import ModelTrainingIcon from '@mui/icons-material/ModelTraining';
 
diff --git a/app/client/src/pages/EvaluationDetails/EvaluationConfigurationTab.tsx b/app/client/src/pages/EvaluationDetails/EvaluationConfigurationTab.tsx
index c445348..f4dff95 100644
--- a/app/client/src/pages/EvaluationDetails/EvaluationConfigurationTab.tsx
+++ b/app/client/src/pages/EvaluationDetails/EvaluationConfigurationTab.tsx
@@ -1,8 +1,6 @@
 import { Badge, Col, Flex, Modal, Row, Table, Typography } from "antd";
 import { Evaluation } from "../Evaluator/types";
 import styled from "styled-components";
-import { QuestionSolution } from "../DataGenerator/types";
-import isEmpty from "lodash/isEmpty";
 import ExampleModal from "../DatasetDetails/ExampleModal";
 import { getColorCode } from "../Evaluator/util";
 
diff --git a/app/client/src/pages/EvaluationDetails/EvaluationGenerationTab.tsx b/app/client/src/pages/EvaluationDetails/EvaluationGenerationTab.tsx
index d9dfcbd..d2e5a7d 100644
--- a/app/client/src/pages/EvaluationDetails/EvaluationGenerationTab.tsx
+++ b/app/client/src/pages/EvaluationDetails/EvaluationGenerationTab.tsx
@@ -5,6 +5,7 @@
 import { Dataset, Evaluation, EvaluationDetails } from '../Evaluator/types';
 import styled from 'styled-components';
 import { getTopicMap } from '../Evaluator/util';
 import EvaluateTopicTable from '../Evaluator/EvaluateTopicTable';
+import FreeFormEvaluationTable from '../Evaluator/FreeFromEvaluationTable';
 
 
 interface Props {
 
 const Container = styled.div`
   background-color: #ffffff;
 `;
 
-const EvaluationGenerationTab: React.FC = ({ dataset, evaluation, evaluationDetails }) => {
+const EvaluationGenerationTab: React.FC = ({ dataset, evaluationDetails }) => {
   const result = get(evaluationDetails, 'evaluation');
+  const isFreeForm = get(dataset, 'technique', false) === 'freeform';
 
-  let topicTabs: any[] = [];
+  let topicTabs: unknown[] = [];
   const { topics, topicMap } = getTopicMap({ result });
-  if (dataset.topics !== null && !isEmpty(dataset.topics)) {
+  if (dataset.topics !== null && !isEmpty(dataset.topics) && !isFreeForm) {
     topicTabs = topics.map((topicName: string, index: number) => ({
       key: `${topicName}-${index}`,
       label: topicName,
@@ -32,7 +34,7 @@ const EvaluationGenerationTab: React.FC = ({ dataset, evaluation, evaluat
     }));
   }
 
-  if (isEmpty(topicTabs)) {
+  if (isEmpty(topicTabs) && !isFreeForm) {
     const values = Object.values(topicMap);
     return (
 
     );
   }
 
+  if (isFreeForm) {
+    return (
+      <Container>
+        <FreeFormEvaluationTable
+          data={get(result, 'results', [])}
+        />
+      </Container>
+    );
+  }
+
   return (
 
diff --git a/app/client/src/pages/EvaluationDetails/hooks.ts b/app/client/src/pages/EvaluationDetails/hooks.ts
index 7438e51..08a70a4 100644
--- a/app/client/src/pages/EvaluationDetails/hooks.ts
+++ b/app/client/src/pages/EvaluationDetails/hooks.ts
@@ -31,11 +31,7 @@ const fetchEvaluationDetails = async (evaluate_file_name: string) => {
     queryKey: ['data', fetchEvaluationDetails],
     queryFn: () => fetchEvaluationDetails(generate_file_name),
     placeholderData: (previousData) => previousData
-  });
-
-  // const dataset = get(data, 'dataset');
-  console.log('data:', data);
-  console.log('error:', error);
+  });
 
   if (error) {
     notification.error({
diff --git a/app/client/src/pages/Evaluator/EvaluateExampleTable.tsx b/app/client/src/pages/Evaluator/EvaluateExampleTable.tsx
index 6a8761a..53cf280 100644
--- a/app/client/src/pages/Evaluator/EvaluateExampleTable.tsx
+++ b/app/client/src/pages/Evaluator/EvaluateExampleTable.tsx
@@ -5,7 +5,7 @@
 import { Dataset, EvaluateExample, EvaluateExampleRecord } from "./types";
 import React, { useEffect, useState } from 'react';
-import { DeleteOutlined, EditOutlined, Add } from "@mui/icons-material";
+import { DeleteOutlined, EditOutlined } from "@mui/icons-material";
 import TooltipIcon from "../../components/TooltipIcon";
 import StyledTitle from "./StyledTitle";
 import styled from "styled-components";
@@ -79,7 +79,7 @@ const EvaluateExampleTable: React.FC = ({ examples, form }) => {
   }
 
   const onDelete = (index: number) => {
-    let _promptExamples = clone(evaluateExamples);
+    const _promptExamples = clone(evaluateExamples);
     pullAt(_promptExamples, index);
     setEvaluateExamples(_promptExamples);
   }
 
@@ -142,7 +142,7 @@
-
diff --git a/app/client/src/pages/Evaluator/EvaluatorPage.tsx b/app/client/src/pages/Evaluator/EvaluatorPage.tsx
index b1f9abb..9864420 100644
--- a/app/client/src/pages/Evaluator/EvaluatorPage.tsx
+++ b/app/client/src/pages/Evaluator/EvaluatorPage.tsx
@@ -1,8 +1,6 @@
 import get from 'lodash/get';
-import set from 'lodash/set';
 import isEmpty from 'lodash/isEmpty';
 import React, { useEffect, useState } from 'react';
-import { useMutation } from '@tanstack/react-query';
 import { useParams } from 'react-router-dom';
 import { ModelParameters } from '../../types';
 import { Button, Form, FormInstance, Result } from 'antd';
@@ -34,7 +32,7 @@ const EvaluatorPage: React.FC = () => {
       const values = form.getFieldsValue();
       form.setFieldsValue({
         ...values,
-        custom_prompt: '' || prompt,
+        custom_prompt: prompt || '',
         top_p: get(parameters, 'top_p'),
         top_k: get(parameters, 'top_k'),
         min_p: get(parameters, 'min_p'),
@@ -48,16 +46,18 @@ const EvaluatorPage: React.FC = () => {
     }
   }, [dataset]);
 
-const mutation = useMutation(async (formData) => {
-  const response = await fetch(`${BASE_API_URL}/synthesis/evaluate`, {
-    method: 'POST',
-    body: JSON.stringify(formData),
-  });
-  return response.json();
-  });
+// const mutation = useMutation(async (formData) => {
+//   const response = await fetch(`${BASE_API_URL}/synthesis/evaluate`, {
+//     method: 'POST',
+//     body: JSON.stringify(formData),
+//   });
+//   return response.json();
+// });
 
-const evaluateDataset = async (formData: any) => {
-  const response = await fetch(`${BASE_API_URL}/synthesis/evaluate`, {
+const evaluateDataset = async (formData: unknown) => {
+  const url = dataset.technique === 'freeform' ?
+    `${BASE_API_URL}/synthesis/evaluate_freeform` : `${BASE_API_URL}/synthesis/evaluate`;
+  const response = await fetch(url, {
     method: 'POST',
     headers: {
       'Content-Type': 'application/json',
diff --git a/app/client/src/pages/Evaluator/EvaluatorSuccess.tsx b/app/client/src/pages/Evaluator/EvaluatorSuccess.tsx
index 06ece55..106c43d 100644
--- a/app/client/src/pages/Evaluator/EvaluatorSuccess.tsx
+++ b/app/client/src/pages/Evaluator/EvaluatorSuccess.tsx
@@ -1,6 +1,6 @@
 import get from 'lodash/get';
 import isEmpty from 'lodash/isEmpty';
-import React, { useState } from 'react';
+import React from 'react';
 import { Link } from 'react-router-dom';
 import { Avatar, Button, Card, Flex, Layout, List, Tabs, Typography } from 'antd';
 import CheckCircleIcon from '@mui/icons-material/CheckCircle';
@@ -13,13 +13,14 @@ import { getProjectJobsUrl } from './hooks';
 import { Dataset } from './types';
 import { WorkflowType } from '../DataGenerator/types';
 import SeedEvaluateTable from './SeedEvaluateTable';
+import FreeFromEvaluationTable from './FreeFromEvaluationTable';
 
 const { Content } = Layout;
 const { Title } = Typography;
 
 interface Props {
-  result: any;
+  result: unknown;
   demo: boolean;
   dataset: Dataset;
 }
@@ -40,10 +41,13 @@ const StyleContent = styled(Content)`
 
 const EvaluatorSuccess: React.FC<Props> = ({ result, dataset, demo }) => {
 
-  const hasTopics = (result: any) => {
-    return !Array.isArray(result?.results)
+  const hasTopics = (result: unknown) => {
+    return !Array.isArray(result?.results);
   }
 
+  const isFreeForm = (dataset: Dataset) =>
+    dataset?.technique === 'freeform';
+
   const hasCustomSeed = (_dataset: Dataset) =>
     (_dataset?.technique === 'sft' && !isEmpty(_dataset?.doc_paths)) ||
     (_dataset?.technique === WorkflowType.CUSTOM_DATA_GENERATION && !isEmpty(_dataset?.input_path))
@@ -88,9 +92,14 @@
           {'Your dataset evaluation was successfully generated. You can review your evaluation in the table below.'}
-          {!isCustom && !isEmpty(topicTabs) &&
-            <Tabs items={topicTabs} />
-          }
+          {!isCustom && !isEmpty(topicTabs) && !isFreeForm(dataset) &&
+            <Tabs
+              items={topicTabs} />
+          }
+          {isFreeForm(dataset) &&
+            <FreeFromEvaluationTable
+              data={get(result, 'results', [])} />
+          }
         }
         {isCustom && <>
diff --git a/app/client/src/pages/Evaluator/FreeFromEvaluationTable.tsx b/app/client/src/pages/Evaluator/FreeFromEvaluationTable.tsx
new file mode 100644
index 0000000..6e905a8
--- /dev/null
+++ b/app/client/src/pages/Evaluator/FreeFromEvaluationTable.tsx
@@ -0,0 +1,140 @@
+import first from 'lodash/first';
+import isEmpty from 'lodash/isEmpty';
+import React, { useCallback, useEffect, useMemo, useState } from 'react';
+import { AgGridReact } from 'ag-grid-react';
+import Paragraph from 'antd/es/typography/Paragraph';
+// import { TextFilterModule } from 'ag-grid-community';
+// import { NumberFilterModule } from 'ag-grid-community';
+// import { DateFilterModule } from 'ag-grid-community';
+import {
+  // ModuleRegistry,
+  // ClientSideRowModelModule,
+  // ValidationModule,
+  type ColDef,
+  type GetRowIdFunc,
+  type GetRowIdParams
+} from 'ag-grid-community';
+
+import { themeMaterial } from "ag-grid-community";
+import get from 'lodash/get';
+import { getColorCode } from './util';
+import { Badge, Popover, Tooltip } from 'antd';
+import styled from 'styled-components';
+
+interface Props {
+  data: unknown[];
+}
+
+const StyledParagraph = styled(Paragraph)`
+  font-size: 13px;
+  font-family: Roboto, -apple-system, 'Segoe UI', sans-serif;
+  color: #5a656d;
+`;
+
+const FreeFormEvaluationTable: React.FC<Props> = ({ data }) => {
+  const [colDefs, setColDefs] = useState<ColDef[]>([]);
+  const [rowData, setRowData] = useState<unknown[]>([]);
+
+  useEffect(() => {
+    if (!isEmpty(data)) {
+      const rows = data.map((item) => {
+        const row = get(item, 'row');
+        return {
+          score: get(item, 'evaluation.score'),
+          justification: get(item, 'evaluation.justification'),
+          ...row
+        }
+      });
+
+      const columnNames = Object.keys(first(rows) || {});
+      const columnDefs = columnNames.map((colName) => {
+        const columnDef = {
+          field: colName,
+          headerName: colName,
+          width: 250,
+          filter: true,
+          sortable: true,
+          resizable: true
+        }
+        if (colName === 'score') {
+          columnDef['width'] = 120
+          columnDef['cellRenderer'] = (params: unknown) => {
+            return <Badge color={getColorCode(params.value)} count={params.value} />
+          }
+        } else if (colName === 'justification') {
+          columnDef['cellRenderer'] = (params: unknown) => (
+            <Tooltip title={params.value}>
+              <StyledParagraph>
+                {params.value}
+              </StyledParagraph>
+            </Tooltip>
+          );
+        }
+
+        return columnDef;
+      });
+      setColDefs(columnDefs);
+      setRowData(rows);
+    }
+  }, [data]);
+
+  const defaultColDef: ColDef = useMemo(
+    () => ({
+      flex: 1,
+      filter: true,
+      enableRowGroup: true,
+      enableValue: true,
+      editable: true,
+      minWidth: 170
+    }),
+    []
+  );
+
+  let index = 0;
+  const getRowId = useCallback<GetRowIdFunc>(
+    ({ data: { ticker } }: GetRowIdParams) => {
+      index++;
+      return ticker || String(index);
+    },
+    []
+  );
+
+  const statusBar = useMemo(
+    () => ({
+      statusPanels: [
+        { statusPanel: "agTotalAndFilteredRowCountComponent" },
+        { statusPanel: "agTotalRowCountComponent" },
+        { statusPanel: "agFilteredRowCountComponent" },
+        { statusPanel: "agSelectedRowCountComponent" },
+        { statusPanel: "agAggregationComponent" },
+      ],
+    }),
+    []
+  );
+
+
+  return (
+    <>
+      <div style={{ height: 600 }}>
+        <AgGridReact
+          rowData={rowData}
+          columnDefs={colDefs}
+          defaultColDef={defaultColDef}
+          getRowId={getRowId}
+          statusBar={statusBar}
+          theme={themeMaterial}
+        />
+      </div>
+    </>
+  );
+}
+
+export default FreeFormEvaluationTable;
\ No newline at end of file
diff --git a/app/client/src/pages/Evaluator/GeneratedEvaluationModal.tsx b/app/client/src/pages/Evaluator/GeneratedEvaluationModal.tsx
index 3df4f4e..057fcc0 100644
--- a/app/client/src/pages/Evaluator/GeneratedEvaluationModal.tsx
+++ b/app/client/src/pages/Evaluator/GeneratedEvaluationModal.tsx
@@ -2,14 +2,12 @@ import get from 'lodash/get';
 import isString from 'lodash/isString';
 import React from 'react';
 import { EvaluatedPair } from "./types";
-import { Badge, Button, Flex, Layout, Modal, Tooltip } from 'antd';
+import { Badge, Button, Flex, Modal, Tooltip } from 'antd';
 import { QuestionCircleOutlined } from '@ant-design/icons';
 import styled from 'styled-components';
 import Markdown from '../../components/Markdown';
 import { getColorCode } from './util';
 
-const { Content } = Layout;
-
 interface Props {
   evaluatedPair: EvaluatedPair;
   onClose: () => void;
diff --git a/app/client/src/pages/Evaluator/ReevaluatorPage.tsx b/app/client/src/pages/Evaluator/ReevaluatorPage.tsx
index 8d9e841..d8f3a97 100644
--- a/app/client/src/pages/Evaluator/ReevaluatorPage.tsx
+++ b/app/client/src/pages/Evaluator/ReevaluatorPage.tsx
@@ -21,8 +21,7 @@ const ReevaluatorPage: React.FC = () => {
     evaluate,
     dataset,
     prompt,
-    examples,
-    isLoading
+    examples
   } = useGetEvaluate(evaluate_file_name as string);
 
   const modelsReq = useModels();
@@ -33,9 +32,8 @@ const ReevaluatorPage: React.FC = () => {
   useEffect(() => {
     if (!isEmpty(evaluate)) {
       const parameters: ModelParameters = get(evaluate, 'model_parameters');
-      console.log('parameters', parameters);
       const values = form.getFieldsValue();
-      console.log('prompt', prompt);
+
       form.setFieldsValue({
         ...values,
         display_name: get(evaluate, 'display_name'),
@@ -52,7 +50,7 @@ const ReevaluatorPage: React.FC = () => {
     }
   }, [evaluate]);
 
-  const evaluateDataset = async (formData: any) => {
+  const evaluateDataset = async (formData: unknown) => {
     const response = await fetch(`${BASE_API_URL}/synthesis/evaluate`, {
       method: 'POST',
       headers: {
diff --git a/app/client/src/pages/Evaluator/SeedEvaluateTable.tsx b/app/client/src/pages/Evaluator/SeedEvaluateTable.tsx
index 668667c..9b9f877 100644
--- a/app/client/src/pages/Evaluator/SeedEvaluateTable.tsx
+++ b/app/client/src/pages/Evaluator/SeedEvaluateTable.tsx
@@ -6,7 +6,7 @@
 import { getColorCode } from './util';
 import { Badge, Table } from 'antd';
 
 interface Props {
-  results: any;
+  results: unknown;
 }
 
 const SeedEvaluateTable: React.FC<Props> = ({ results }) => {
@@ -17,10 +17,10 @@ const SeedEvaluateTable: React.FC<Props> = ({ results }) => {
   }
   const seeds = Object.values(result);
   const data = [];
-  forEach(seeds, (seed: any) => {
+  forEach(seeds, (seed: unknown) => {
     const pairs = get(seed, `evaluated_pairs`);
     if (Array.isArray(pairs)) {
-      forEach(pairs, (pair: any) => {
+      forEach(pairs, (pair: unknown) => {
         data.push({
           seed,
           question: get(pair, `question`),
diff --git a/app/client/src/pages/Home/DatasetsTab.tsx b/app/client/src/pages/Home/DatasetsTab.tsx
index 84b54ea..0149df5 100644
--- a/app/client/src/pages/Home/DatasetsTab.tsx
+++ b/app/client/src/pages/Home/DatasetsTab.tsx
@@ -86,7 +86,7 @@ const DatasetsTab: React.FC = () => {
     }
   }, [exportResult, notificationInstance])
 
-  const onSearch: SearchProps['onSearch'] = (value, _e, info) => {
+  const onSearch: SearchProps['onSearch'] = (value: string) => {
     throttle((value: string) => setSearchQuery(value), 500)(value);
   }
 
@@ -116,12 +116,14 @@ const DatasetsTab: React.FC = () => {
     title: 'Dataset Name',
     dataIndex: 'generate_file_name',
     sorter: sortItemsByKey('generate_file_name'),
+    width: 250,
     render: (generate_file_name) => {generate_file_name}
   }, {
     key: 'model_id',
     title: 'Model',
     dataIndex: 'model_id',
     sorter: sortItemsByKey('model_id'),
+    width: 250,
     render: (modelId) => {modelId}
   }, {
     key: 'num_questions',
diff --git a/app/client/src/pages/Home/EvaluateButton.tsx b/app/client/src/pages/Home/EvaluateButton.tsx
index 4cb5d32..f9ea18a 100644
--- a/app/client/src/pages/Home/EvaluateButton.tsx
+++ b/app/client/src/pages/Home/EvaluateButton.tsx
@@ -8,7 +8,6 @@
 import { isEmpty } from "lodash";
 import { Dataset } from "../Evaluator/types";
 import { Pages } from "../../types";
 
-const { Option } = Select;
 
 const EvaluateButton: React.FC = () => {
   const [form] = Form.useForm();
@@ -40,7 +39,7 @@ const EvaluateButton: React.FC = () => {
     }
   }
 
-  const options = datasets.map((dataset: any) => ({
+  const options = datasets.map((dataset: unknown) => ({
     value: dataset.display_name,
     label: dataset.display_name,
     key: `${dataset?.display_name}-${dataset?.generate_file_name}`
diff --git a/app/client/src/pages/Home/EvaluationsTab.tsx b/app/client/src/pages/Home/EvaluationsTab.tsx
index 62e8510..36b786f 100644
--- a/app/client/src/pages/Home/EvaluationsTab.tsx
+++ b/app/client/src/pages/Home/EvaluationsTab.tsx
@@ -65,7 +65,7 @@ const EvaluationsTab: React.FC = () => {
     }
   }, [isError]);
 
-  const onSearch: SearchProps['onSearch'] = (value, _e, info) => {
+  const onSearch: SearchProps['onSearch'] = (value: string) => {
     throttle((value: string) => setSearchQuery(value), 500)(value);
   }
 
diff --git a/app/client/src/pages/Home/hooks.ts b/app/client/src/pages/Home/hooks.ts
index b588282..dee238e 100644
--- a/app/client/src/pages/Home/hooks.ts
+++ b/app/client/src/pages/Home/hooks.ts
@@ -38,7 +38,7 @@ export const useDatasets = () => {
     }
   );
   if (searchQuery !== null && !isEmpty(searchQuery)) {
-    const filteredData = data?.datasets.filter((dataset: any) => {
+    const filteredData = data?.datasets.filter((dataset: unknown) => {
       return dataset.display_name.toLowerCase().includes(searchQuery.toLowerCase());
     });
 
@@ -71,7 +71,7 @@ export const useEvaluations = () => {
     }
   );
   if (searchQuery !== null && !isEmpty(searchQuery)) {
-    const filteredData = data?.evaluations.filter((evaluation: any) => {
+    const filteredData = data?.evaluations.filter((evaluation: unknown) => {
       return evaluation.display_name.toLowerCase().includes(searchQuery.toLowerCase());
     });
 
diff --git a/app/client/src/utils/sortutils.ts b/app/client/src/utils/sortutils.ts
index 396e14d..1754c7e 100644
--- a/app/client/src/utils/sortutils.ts
+++ b/app/client/src/utils/sortutils.ts
@@ -1,5 +1,5 @@
 type Key = string | number;
-type Item = { [x: string]: any };
+type Item = { [x: string]: unknown };
 
 /**
  * Helper function to use as a comparer when sorting an array of items based on a key in the item.
diff --git a/build/shell_scripts/build_client.sh b/build/shell_scripts/build_client.sh
index d70a30c..2a0e6bb 100644
--- a/build/shell_scripts/build_client.sh
+++ b/build/shell_scripts/build_client.sh
@@ -16,6 +16,7 @@ fi
 
 # Activate virtual environment - using relative path
 source .venv/bin/activate
+export NODE_OPTIONS=--max-old-space-size=16384
 # Build frontend
 cd "$CLIENT_DIR"
 rm -rf node_modules/
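Note: the free-form tables in this patch (FreeFormTable, FreeFormExampleTable, FreeFormEvaluationTable) share one idea: free-form rows carry no fixed schema, so grid columns are inferred from the keys of the first row. A standalone sketch of that pattern follows; the helper name, widths, and special-casing are illustrative, not lifted from the patch:

```ts
import first from 'lodash/first';
import type { ColDef } from 'ag-grid-community';

// Derive ag-grid column definitions from the keys of the first row.
export const inferColumnDefs = (rows: Record<string, unknown>[]): ColDef[] => {
  const sample = first(rows);
  if (!sample) {
    return []; // nothing to infer from an empty result set
  }
  return Object.keys(sample).map((field) => ({
    field,
    headerName: field,
    filter: true,
    sortable: true,
    resizable: true,
    // FreeFormEvaluationTable swaps in Badge/Tooltip cell renderers for
    // score and justification; the narrower score column is the only
    // special case kept in this sketch.
    width: field === 'score' ? 120 : 250,
  }));
};
```

FreeFormEvaluationTable specializes this by prepending `score` and `justification` from each item's `evaluation` object onto the row before inferring columns, so evaluation results and the original free-form fields render in a single grid.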