
Commit 47ce2bd

Merge pull request #77 from cloudera/dev-templates
Dev templates: Lending data, Credit card data
2 parents 6b85beb + b6920cc commit 47ce2bd

File tree

9 files changed: +913 −143 lines changed


.gitignore

Lines changed: 10 additions & 1 deletion
@@ -74,4 +74,13 @@ app/frontend/
 app/launch_streamlit.py
 
 # Virtual environment
-.venv/
+.venv/freeform_data_claude_*
+row_data_claude_*
+freeform_data_claude_*
+housing_example.json
+seeds_test.json
+test.csv
+sample_200x100.csv
+Raw_Web_Visit_sample.csv
+Raw_Web_Visit_Sample.csv
+Raw_Web_Visit_Sample.csv

Raw_Web_Visit_Sample.csv

52 KB
Binary file not shown.

app/core/config.py

Lines changed: 822 additions & 55 deletions
Large diffs are not rendered by default.
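
The config.py diff carries the bulk of the commit (822 additions) but is collapsed here. The rest of the commit pins down what the new registries must expose: USE_CASE_CONFIGS[use_case] is read for .prompt, .topics and .default_examples, and USE_CASE_CONFIGS_EVALS[use_case] for .prompt and .default_examples. A minimal sketch of that assumed shape follows; the class names and dataclass layout are hypothetical, only the registry and attribute names come from this diff.

# Sketch only, not the actual app/core/config.py implementation.
from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class UseCaseConfig:                                        # hypothetical name
    prompt: str                                             # returned by the gen-prompt endpoint in app/main.py
    topics: List[str] = field(default_factory=list)         # returned by /use-cases/{use_case}/topics
    default_examples: List[dict] = field(default_factory=list)  # fallback examples in PromptHandler

@dataclass
class UseCaseEvalConfig:                                    # hypothetical name
    prompt: str                                             # returned by /{use_case}/eval_prompt
    default_examples: List[dict] = field(default_factory=list)  # returned by /{use_case}/eval_examples

# In the real module these are keyed by the UseCase enum rather than plain strings.
USE_CASE_CONFIGS: Dict[str, UseCaseConfig] = {}
USE_CASE_CONFIGS_EVALS: Dict[str, UseCaseEvalConfig] = {}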

app/core/prompt_templates.py

Lines changed: 60 additions & 21 deletions
@@ -5,7 +5,7 @@
 import pandas as pd
 import numpy as np
 from app.models.request_models import Example, Example_eval
-from app.core.config import UseCase, Technique, ModelFamily, get_model_family,USE_CASE_CONFIGS, LENDING_DATA_PROMPT
+from app.core.config import UseCase, Technique, ModelFamily, get_model_family,USE_CASE_CONFIGS, LENDING_DATA_PROMPT, USE_CASE_CONFIGS_EVALS
 from app.core.data_loader import DataLoader
 from app.core.data_analyser import DataAnalyser
 from app.core.summary_formatter import SummaryFormatter

@@ -73,6 +73,8 @@
 );
 """
 
+
+
 DEFAULT_freeform_TEXT2SQL_PROMPT = """Requirements:
 - Each solution must be a working SQL query
 - Include explanations where needed

@@ -92,6 +94,43 @@
 - "question"
 - "solution"""
 
+Default_freeform_lending_data_prompt = """
+You need to create profile data for the LendingClub company which specialises in lending various types of loans to urban customers.
+
+
+You need to generate the data in the same order for the following fields (description of each field is followed after the colon):
+
+loan_amnt: The listed amount of the loan applied for by the borrower. If at some point in time, the credit department reduces the loan amount, then it will be reflected in this value.
+term: The number of payments on the loan. Values are in months and can be either 36 months or 60 months.
+int_rate: Interest Rate on the loan
+installment: The monthly payment owed by the borrower if the loan originates.
+grade: LC assigned loan grade (Possible values: A, B, C, D, E, F, G)
+sub_grade: LC assigned loan subgrade (Possible sub-values: 1-5 i.e A5)
+emp_title: The job title supplied by the Borrower when applying for the loan.
+emp_length: Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years.
+home_ownership: The home ownership status provided by the borrower during registration or obtained from the credit report. Our values are: RENT, OWN, MORTGAGE, OTHER
+annual_inc: The self-reported annual income provided by the borrower during registration.
+verification_status: Indicates if income was verified by LC, not verified, or if the income source was verified
+issue_d: The month which the loan was funded
+loan_status: Current status of the loan
+purpose: A category provided by the borrower for the loan request.
+title: The loan title provided by the borrower
+dti: A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income.
+earliest_cr_line: The month the borrower's earliest reported credit line was opened
+open_acc: The number of open credit lines in the borrower's credit file.
+pub_rec: Number of derogatory public records
+revol_bal: Total credit revolving balance
+revol_util: Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit.
+total_acc: The total number of credit lines currently in the borrower's credit file
+initial_list_status: The initial listing status of the loan. Possible values are – W, F
+application_type: Indicates whether the loan is an individual application or a joint application with two co-borrowers
+mort_acc: Number of mortgage accounts.
+pub_rec_bankruptcies: Number of public record bankruptcies
+address: The physical address of the person
+
+Ensure PII from examples such as addresses are not used in the generated data to minimize any privacy concerns.
+"""
+
 DEFAULT_TEXT2SQL_PROMPT = """Requirements:
 - Each solution must be a working SQL query
 - Include explanations where needed

@@ -135,9 +174,10 @@ class PromptHandler:
     def format_examples(examples: List[Example]) -> str:
         """Format examples as JSON string"""
         return [
-            {"question": example.question, "solution": example.solution}
-            for example in (examples)
-        ]
+            {"question": example.question, "solution": example.solution}
+            for example in (examples)
+        ]
+
     @staticmethod
     def format_examples_eval(examples: List[Example_eval]) -> str:
         """Format examples as JSON string"""

@@ -196,17 +236,7 @@ def get_freeform_default_custom_prompt(use_case:UseCase, custom_prompt):
     @staticmethod
     def get_default_custom_eval_prompt(use_case:UseCase, custom_prompt):
         if custom_prompt == None:
-            if use_case == UseCase.TEXT2SQL:
-                custom_prompt = DEFAULT_TEXT2SQL_EVAL_PROMPT
-
-                return custom_prompt
-            elif use_case == UseCase.CODE_GENERATION:
-                custom_prompt = DEFAULT_CODE_GENERATION_EVAL_PROMPT
-                return custom_prompt
-
-            elif use_case == UseCase.CUSTOM:
-                custom_prompt = " "
-                return custom_prompt
+            return USE_CASE_CONFIGS_EVALS[use_case].prompt
         else:
             return custom_prompt
     @staticmethod

@@ -536,7 +566,13 @@ def get_freeform_eval_prompt(model_id: str,
                              custom_prompt = Optional[str]
                              ) -> str:
         custom_prompt_str = PromptHandler.get_default_custom_eval_prompt(use_case, custom_prompt)
-        examples_str = PromptHandler.get_default_eval_example(use_case, examples)
+        #examples_str = PromptHandler.get_default_eval_example(use_case, examples)
+
+        if examples:
+            examples_str = PromptHandler.format_examples_eval(examples)
+
+        elif examples == [] or examples == None:
+            examples_str = PromptHandler.format_examples_eval(USE_CASE_CONFIGS_EVALS[use_case].default_examples)
 
         base_prompt = """ You are a brilliant judge on evaluating a set of data with fields and corresponding values
         Follow the given instructions to understand the structure of given data and evaluate it based on parameters defined for you."""

@@ -1003,11 +1039,14 @@ def json_serializable(obj):
             examples_str = json.dumps(example_custom, indent=2)
 
         else:
-            if use_case == UseCase.CODE_GENERATION or use_case == UseCase.TEXT2SQL:
-                examples_str = json.dumps(USE_CASE_CONFIGS[use_case].default_examples)
-            else:
-                examples_str = None
-        custom_prompt_default = PromptHandler.get_freeform_default_custom_prompt(use_case, custom_prompt)
+            #if use_case == UseCase.CODE_GENERATION or use_case == UseCase.TEXT2SQL or use_case == UseCase.LENDING_DATA:
+            examples_str = json.dumps(USE_CASE_CONFIGS[use_case].default_examples)
+
+        if custom_prompt is None:
+            custom_prompt_default = USE_CASE_CONFIGS[use_case].prompt
+        else:
+            custom_prompt_default = custom_prompt
+        #custom_prompt_default = PromptHandler.get_freeform_default_custom_prompt(use_case, custom_prompt)
         schema_str = PromptHandler.get_default_schema(use_case, schema)
         if use_case ==UseCase.TEXT2SQL:
             custom_prompt_str = f"""Using this database schema:
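
The new Default_freeform_lending_data_prompt asks the model to emit one value per field, in the order listed above. Purely as an illustration of that layout, a single generated record could look like the sketch below; every value is invented for this example (it is not taken from the repository or from real LendingClub data), and the address is deliberately synthetic, in line with the prompt's PII instruction.

# Hypothetical record matching the field order in Default_freeform_lending_data_prompt.
sample_lending_record = {
    "loan_amnt": 12000,
    "term": "36 months",
    "int_rate": 11.99,
    "installment": 398.52,
    "grade": "B",
    "sub_grade": "B4",
    "emp_title": "Operations Analyst",
    "emp_length": 6,
    "home_ownership": "RENT",
    "annual_inc": 68000,
    "verification_status": "Source Verified",
    "issue_d": "Mar-2019",
    "loan_status": "Fully Paid",
    "purpose": "debt_consolidation",
    "title": "Debt consolidation",
    "dti": 18.4,
    "earliest_cr_line": "Jul-2004",
    "open_acc": 9,
    "pub_rec": 0,
    "revol_bal": 8350,
    "revol_util": 42.7,
    "total_acc": 21,
    "initial_list_status": "W",
    "application_type": "INDIVIDUAL",
    "mort_acc": 1,
    "pub_rec_bankruptcies": 0,
    "address": "742 Example Street, Springfield, ZZ 00000",  # synthetic, no real PII
}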

app/main.py

Lines changed: 10 additions & 60 deletions
@@ -47,7 +47,7 @@
 from app.services.export_results import Export_Service
 
 from app.core.prompt_templates import PromptBuilder, PromptHandler
-from app.core.config import UseCase, USE_CASE_CONFIGS
+from app.core.config import UseCase, USE_CASE_CONFIGS, USE_CASE_CONFIGS_EVALS
 from app.core.database import DatabaseManager
 from app.core.exceptions import APIError, InvalidModelError, ModelHandlerError
 from app.services.model_alignment import ModelAlignment

@@ -469,10 +469,6 @@ def _flatten(d: dict, parent_key:str = "", sep:str=".") -> dict:
               description = "get json content")
 async def get_dataset_size(request: RelativePath):
 
-
-
-
-
     if not request.path:
         return JSONResponse(status_code=400, content={"status": "failed", "error": "path missing"})
 

@@ -948,15 +944,15 @@ async def customise_prompt(use_case: UseCase):
 async def customise_prompt(use_case: UseCase):
     """Allow users to customize prompt. Only part of the prompt which can be customized"""
     try:
-        return PromptHandler.get_freeform_default_custom_prompt(use_case, custom_prompt=None)
+        return USE_CASE_CONFIGS[use_case].prompt
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.get("/{use_case}/eval_prompt")
 async def customise_prompt(use_case: UseCase):
     """Allow users to customize prompt. Only part of the prompt which can be customized"""
     try:
-        return PromptHandler.get_default_custom_eval_prompt(use_case, custom_prompt=None)
+        return USE_CASE_CONFIGS_EVALS[use_case].prompt
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 

@@ -1146,16 +1142,8 @@ async def delete_evaluation(file_name: str, file_path: Optional[str] = None):
 @app.get("/use-cases/{use_case}/topics")
 async def get_topics(use_case: UseCase):
     """Get available topics for a specific use case"""
-    uc_topics = {"code_generation": ["Algorithms", "Async Programming",
-                                     "Data Structures", "Database Operations",
-                                     "Python Basics", "Web Development"],
-
-                 "text2sql":["Aggregations", "Basic Queries", "Data Manipulation",
-                             "Joins", "Subqueries", "Window Functions"],
-                 "custom": []
-                 }
-
-    topics = uc_topics[use_case]
+
+    topics = USE_CASE_CONFIGS[use_case].topics
 
     return {"topics":topics}
 

@@ -1168,50 +1156,12 @@ async def get_gen_examples(use_case: UseCase):
 
 @app.get("/{use_case}/eval_examples")
 async def get_eval_examples(use_case: UseCase):
-    if use_case == UseCase.CODE_GENERATION:
-        examples = [
-            {
-                "score": 3,
-                "justification": """The code achieves 3 points by implementing core functionality correctly (1),
-                showing generally correct implementation with proper syntax (2),
-                and being suitable for professional use with good Python patterns and accurate functionality (3).
-                While it demonstrates competent development practices, it lacks the robust error handling
-                and type hints needed for point 4, and could benefit from better efficiency optimization and code organization."""
-            },
-            {
-                "score": 4,
-                "justification": """
-                The code earns 4 points by implementing basic functionality (1), showing correct implementation (2),
-                being production-ready (3), and demonstrating high efficiency with Python best practices
-                including proper error handling, type hints, and clear documentation (4).
-                It exhibits experienced developer qualities with well-structured code and maintainable design, though
-                it lacks the comprehensive testing and security considerations needed for a perfect score."""
-            }
-        ]
-    elif use_case == UseCase.TEXT2SQL:
-
-        examples = [ {
-            "score": 3,
-            "justification": """The query earns 3 points by successfully retrieving basic data (1),
-            showing correct logical implementation (2), and being suitable for
-            professional use with accurate data retrieval and good SQL pattern understanding (3).
-            However, it lacks efficiency optimizations and consistent style conventions needed for
-            point 4, using basic JOINs without considering indexing or performance implications.
-            While functional, the query would benefit from better organization and efficiency improvements."""
-        },
-
-        {
-            "score": 4,
-            "justification": """The query merits 4 points by retrieving basic data correctly (1), implementing proper
-            logic (2), being production-ready (3), and demonstrating high efficiency with proper
-            indexing considerations, well-structured JOINs, and consistent formatting (4). It
-            shows experienced developer qualities with appropriate commenting and performant SQL
-            features, though it lacks the comprehensive NULL handling and execution plan optimization needed for a
-            perfect score."""
-        }
-        ]
-    elif use_case ==UseCase.CUSTOM:
+
+    if use_case ==UseCase.CUSTOM:
         examples = []
+
+    else:
+        examples = USE_CASE_CONFIGS_EVALS[use_case].default_examples
 
 
     return {"examples": examples}
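
After this change the endpoints above are thin lookups into the config registries rather than hard-coded dictionaries. A minimal sketch of exercising them, assuming FastAPI's test client (httpx) is available in the test environment and using the `app` object that the decorators in app/main.py already reference:

# Sketch only; topic values shown are the ones the removed uc_topics dict contained,
# now assumed to come from USE_CASE_CONFIGS[use_case].topics.
from fastapi.testclient import TestClient
from app.main import app

client = TestClient(app)

r = client.get("/use-cases/text2sql/topics")
print(r.json())  # expected shape: {"topics": ["Aggregations", "Basic Queries", ...]}

# Eval examples come from USE_CASE_CONFIGS_EVALS[use_case].default_examples,
# except for the custom use case, which still returns an empty list.
r = client.get("/code_generation/eval_examples")
print(r.json())  # expected shape: {"examples": [{"score": ..., "justification": ...}, ...]}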

app/models/request_models.py

Lines changed: 6 additions & 5 deletions
@@ -2,12 +2,13 @@
 import os
 from pydantic import BaseModel, Field, field_validator, ConfigDict
 from enum import Enum
-from app.core.config import USE_CASE_CONFIGS
+from app.core.config import USE_CASE_CONFIGS, UseCase
 
-class UseCase(str, Enum):
-    CODE_GENERATION = "code_generation"
-    TEXT2SQL = "text2sql"
-    CUSTOM = "custom"
+
+# class UseCase(str, Enum):
+#     CODE_GENERATION = "code_generation"
+#     TEXT2SQL = "text2sql"
+#     CUSTOM = "custom"
 
 class Technique(str, Enum):
     SFT = "sft"
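
With the enum commented out here, UseCase is now imported from app.core.config, whose diff is collapsed above. A rough sketch of what it presumably contains; the first three members are the ones shown commented out, while the last two are assumptions inferred from the commit description and from the UseCase.LENDING_DATA reference in app/core/prompt_templates.py:

from enum import Enum

class UseCase(str, Enum):
    CODE_GENERATION = "code_generation"
    TEXT2SQL = "text2sql"
    CUSTOM = "custom"
    LENDING_DATA = "lending_data"          # assumed; referenced in prompt_templates.py
    CREDIT_CARD_DATA = "credit_card_data"  # assumed from the commit description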

app/run_job.py

Lines changed: 3 additions & 0 deletions
@@ -34,7 +34,10 @@
 from app.services.synthesis_service import SynthesisService
 import asyncio
 import nest_asyncio # Add this import
+import json, pandas as pd, numpy as np, os
+from app.agents.schema import GenerationPlan
 
+plan = GenerationPlan.model_validate_json(os.environ.get("JOB_PARAMS"))
 # Enable nested event loop
 nest_asyncio.apply()
 
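
run_job.py now parses a GenerationPlan from the JOB_PARAMS environment variable at import time, so the variable has to hold the serialized plan before the job script is launched. A minimal sketch of the launcher side, assuming GenerationPlan is a Pydantic v2 model (it exposes model_validate_json) and with purely hypothetical field names:

import json
import os

# Hypothetical plan fields; the real schema lives in app.agents.schema.GenerationPlan.
os.environ["JOB_PARAMS"] = json.dumps({
    "use_case": "lending_data",
    "num_questions": 100,
})

# app/run_job.py then executes at import time:
#   plan = GenerationPlan.model_validate_json(os.environ.get("JOB_PARAMS"))
# os.environ.get returns None when JOB_PARAMS is missing, which makes
# model_validate_json raise, so the job will not start without this variable set.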

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.

0 commit comments
