
Commit 47ce2bd

Merge pull request #77 from cloudera/dev-templates
Dev templates: Lending data, Credit card data
2 parents 6b85beb + b6920cc commit 47ce2bd

File tree

9 files changed: +913 −143 lines changed


.gitignore

Lines changed: 10 additions & 1 deletion
@@ -74,4 +74,13 @@ app/frontend/
 app/launch_streamlit.py
 
 # Virtual environment
-.venv/
+.venv/freeform_data_claude_*
+row_data_claude_*
+freeform_data_claude_*
+housing_example.json
+seeds_test.json
+test.csv
+sample_200x100.csv
+Raw_Web_Visit_sample.csv
+Raw_Web_Visit_Sample.csv
+Raw_Web_Visit_Sample.csv

Raw_Web_Visit_Sample.csv

52 KB
Binary file not shown.

app/core/config.py

Lines changed: 822 additions & 55 deletions
Large diffs are not rendered by default.
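
The config.py diff carries the bulk of the commit (822 additions) but is collapsed here. The rest of the commit pins down what the new registries must expose: USE_CASE_CONFIGS[use_case] is read for .prompt, .topics and .default_examples, and USE_CASE_CONFIGS_EVALS[use_case] for .prompt and .default_examples. A minimal sketch of that assumed shape follows; the class names and dataclass layout are hypothetical, only the registry and attribute names come from this diff.

# Sketch only, not the actual app/core/config.py implementation.
from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class UseCaseConfig:                                        # hypothetical name
    prompt: str                                             # returned by the gen-prompt endpoint in app/main.py
    topics: List[str] = field(default_factory=list)         # returned by /use-cases/{use_case}/topics
    default_examples: List[dict] = field(default_factory=list)  # fallback examples in PromptHandler

@dataclass
class UseCaseEvalConfig:                                    # hypothetical name
    prompt: str                                             # returned by /{use_case}/eval_prompt
    default_examples: List[dict] = field(default_factory=list)  # returned by /{use_case}/eval_examples

# In the real module these are keyed by the UseCase enum rather than plain strings.
USE_CASE_CONFIGS: Dict[str, UseCaseConfig] = {}
USE_CASE_CONFIGS_EVALS: Dict[str, UseCaseEvalConfig] = {}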

app/core/prompt_templates.py

Lines changed: 60 additions & 21 deletions
@@ -5,7 +5,7 @@
 import pandas as pd
 import numpy as np
 from app.models.request_models import Example, Example_eval
-from app.core.config import UseCase, Technique, ModelFamily, get_model_family,USE_CASE_CONFIGS, LENDING_DATA_PROMPT
+from app.core.config import UseCase, Technique, ModelFamily, get_model_family,USE_CASE_CONFIGS, LENDING_DATA_PROMPT, USE_CASE_CONFIGS_EVALS
 from app.core.data_loader import DataLoader
 from app.core.data_analyser import DataAnalyser
 from app.core.summary_formatter import SummaryFormatter

@@ -73,6 +73,8 @@
 );
 """
 
+
+
 DEFAULT_freeform_TEXT2SQL_PROMPT = """Requirements:
 - Each solution must be a working SQL query
 - Include explanations where needed

@@ -92,6 +94,43 @@
 - "question"
 - "solution"""
 
+Default_freeform_lending_data_prompt = """
+You need to create profile data for the LendingClub company which specialises in lending various types of loans to urban customers.
+
+
+You need to generate the data in the same order for the following fields (description of each field is followed after the colon):
+
+loan_amnt: The listed amount of the loan applied for by the borrower. If at some point in time, the credit department reduces the loan amount, then it will be reflected in this value.
+term: The number of payments on the loan. Values are in months and can be either 36 months or 60 months.
+int_rate: Interest Rate on the loan
+installment: The monthly payment owed by the borrower if the loan originates.
+grade: LC assigned loan grade (Possible values: A, B, C, D, E, F, G)
+sub_grade: LC assigned loan subgrade (Possible sub-values: 1-5 i.e A5)
+emp_title: The job title supplied by the Borrower when applying for the loan.
+emp_length: Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years.
+home_ownership: The home ownership status provided by the borrower during registration or obtained from the credit report. Our values are: RENT, OWN, MORTGAGE, OTHER
+annual_inc: The self-reported annual income provided by the borrower during registration.
+verification_status: Indicates if income was verified by LC, not verified, or if the income source was verified
+issue_d: The month which the loan was funded
+loan_status: Current status of the loan
+purpose: A category provided by the borrower for the loan request.
+title: The loan title provided by the borrower
+dti: A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income.
+earliest_cr_line: The month the borrower's earliest reported credit line was opened
+open_acc: The number of open credit lines in the borrower's credit file.
+pub_rec: Number of derogatory public records
+revol_bal: Total credit revolving balance
+revol_util: Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit.
+total_acc: The total number of credit lines currently in the borrower's credit file
+initial_list_status: The initial listing status of the loan. Possible values are – W, F
+application_type: Indicates whether the loan is an individual application or a joint application with two co-borrowers
+mort_acc: Number of mortgage accounts.
+pub_rec_bankruptcies: Number of public record bankruptcies
+address: The physical address of the person
+
+Ensure PII from examples such as addresses are not used in the generated data to minimize any privacy concerns.
+"""
+
 DEFAULT_TEXT2SQL_PROMPT = """Requirements:
 - Each solution must be a working SQL query
 - Include explanations where needed

@@ -135,9 +174,10 @@ class PromptHandler:
     def format_examples(examples: List[Example]) -> str:
         """Format examples as JSON string"""
         return [
-            {"question": example.question, "solution": example.solution}
-            for example in (examples)
-        ]
+            {"question": example.question, "solution": example.solution}
+            for example in (examples)
+        ]
+
     @staticmethod
     def format_examples_eval(examples: List[Example_eval]) -> str:
         """Format examples as JSON string"""

@@ -196,17 +236,7 @@ def get_freeform_default_custom_prompt(use_case:UseCase, custom_prompt):
     @staticmethod
     def get_default_custom_eval_prompt(use_case:UseCase, custom_prompt):
         if custom_prompt == None:
-            if use_case == UseCase.TEXT2SQL:
-                custom_prompt = DEFAULT_TEXT2SQL_EVAL_PROMPT
-
-                return custom_prompt
-            elif use_case == UseCase.CODE_GENERATION:
-                custom_prompt = DEFAULT_CODE_GENERATION_EVAL_PROMPT
-                return custom_prompt
-
-            elif use_case == UseCase.CUSTOM:
-                custom_prompt = " "
-                return custom_prompt
+            return USE_CASE_CONFIGS_EVALS[use_case].prompt
         else:
             return custom_prompt
     @staticmethod

@@ -536,7 +566,13 @@ def get_freeform_eval_prompt(model_id: str,
                              custom_prompt = Optional[str]
                              ) -> str:
         custom_prompt_str = PromptHandler.get_default_custom_eval_prompt(use_case, custom_prompt)
-        examples_str = PromptHandler.get_default_eval_example(use_case, examples)
+        #examples_str = PromptHandler.get_default_eval_example(use_case, examples)
+
+        if examples:
+            examples_str = PromptHandler.format_examples_eval(examples)
+
+        elif examples == [] or examples == None:
+            examples_str = PromptHandler.format_examples_eval(USE_CASE_CONFIGS_EVALS[use_case].default_examples)
 
         base_prompt = """ You are a brilliant judge on evaluating a set of data with fields and corresponding values
         Follow the given instructions to understand the structure of given data and evaluate it based on parameters defined for you."""

@@ -1003,11 +1039,14 @@ def json_serializable(obj):
             examples_str = json.dumps(example_custom, indent=2)
 
         else:
-            if use_case == UseCase.CODE_GENERATION or use_case == UseCase.TEXT2SQL:
-                examples_str = json.dumps(USE_CASE_CONFIGS[use_case].default_examples)
-            else:
-                examples_str = None
-        custom_prompt_default = PromptHandler.get_freeform_default_custom_prompt(use_case, custom_prompt)
+            #if use_case == UseCase.CODE_GENERATION or use_case == UseCase.TEXT2SQL or use_case == UseCase.LENDING_DATA:
+            examples_str = json.dumps(USE_CASE_CONFIGS[use_case].default_examples)
+
+        if custom_prompt is None:
+            custom_prompt_default = USE_CASE_CONFIGS[use_case].prompt
+        else:
+            custom_prompt_default = custom_prompt
+        #custom_prompt_default = PromptHandler.get_freeform_default_custom_prompt(use_case, custom_prompt)
         schema_str = PromptHandler.get_default_schema(use_case, schema)
         if use_case ==UseCase.TEXT2SQL:
             custom_prompt_str = f"""Using this database schema:
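
The new Default_freeform_lending_data_prompt asks the model to emit one value per field, in the order listed above. Purely as an illustration of that layout, a single generated record could look like the sketch below; every value is invented for this example (it is not taken from the repository or from real LendingClub data), and the address is deliberately synthetic, in line with the prompt's PII instruction.

# Hypothetical record matching the field order in Default_freeform_lending_data_prompt.
sample_lending_record = {
    "loan_amnt": 12000,
    "term": "36 months",
    "int_rate": 11.99,
    "installment": 398.52,
    "grade": "B",
    "sub_grade": "B4",
    "emp_title": "Operations Analyst",
    "emp_length": 6,
    "home_ownership": "RENT",
    "annual_inc": 68000,
    "verification_status": "Source Verified",
    "issue_d": "Mar-2019",
    "loan_status": "Fully Paid",
    "purpose": "debt_consolidation",
    "title": "Debt consolidation",
    "dti": 18.4,
    "earliest_cr_line": "Jul-2004",
    "open_acc": 9,
    "pub_rec": 0,
    "revol_bal": 8350,
    "revol_util": 42.7,
    "total_acc": 21,
    "initial_list_status": "W",
    "application_type": "INDIVIDUAL",
    "mort_acc": 1,
    "pub_rec_bankruptcies": 0,
    "address": "742 Example Street, Springfield, ZZ 00000",  # synthetic, no real PII
}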

app/main.py

Lines changed: 10 additions & 60 deletions
@@ -47,7 +47,7 @@
 from app.services.export_results import Export_Service
 
 from app.core.prompt_templates import PromptBuilder, PromptHandler
-from app.core.config import UseCase, USE_CASE_CONFIGS
+from app.core.config import UseCase, USE_CASE_CONFIGS, USE_CASE_CONFIGS_EVALS
 from app.core.database import DatabaseManager
 from app.core.exceptions import APIError, InvalidModelError, ModelHandlerError
 from app.services.model_alignment import ModelAlignment

@@ -469,10 +469,6 @@ def _flatten(d: dict, parent_key:str = "", sep:str=".") -> dict:
               description = "get json content")
 async def get_dataset_size(request: RelativePath):
 
-
-
-
-
     if not request.path:
         return JSONResponse(status_code=400, content={"status": "failed", "error": "path missing"})
 

@@ -948,15 +944,15 @@ async def customise_prompt(use_case: UseCase):
 async def customise_prompt(use_case: UseCase):
     """Allow users to customize prompt. Only part of the prompt which can be customized"""
     try:
-        return PromptHandler.get_freeform_default_custom_prompt(use_case, custom_prompt=None)
+        return USE_CASE_CONFIGS[use_case].prompt
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.get("/{use_case}/eval_prompt")
 async def customise_prompt(use_case: UseCase):
     """Allow users to customize prompt. Only part of the prompt which can be customized"""
     try:
-        return PromptHandler.get_default_custom_eval_prompt(use_case, custom_prompt=None)
+        return USE_CASE_CONFIGS_EVALS[use_case].prompt
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 

@@ -1146,16 +1142,8 @@ async def delete_evaluation(file_name: str, file_path: Optional[str] = None):
 @app.get("/use-cases/{use_case}/topics")
 async def get_topics(use_case: UseCase):
     """Get available topics for a specific use case"""
-    uc_topics = {"code_generation": ["Algorithms", "Async Programming",
-                                     "Data Structures", "Database Operations",
-                                     "Python Basics", "Web Development"],
-
-                 "text2sql":["Aggregations", "Basic Queries", "Data Manipulation",
-                             "Joins", "Subqueries", "Window Functions"],
-                 "custom": []
-                 }
-
-    topics = uc_topics[use_case]
+
+    topics = USE_CASE_CONFIGS[use_case].topics
 
     return {"topics":topics}
 

@@ -1168,50 +1156,12 @@ async def get_gen_examples(use_case: UseCase):
 
 @app.get("/{use_case}/eval_examples")
 async def get_eval_examples(use_case: UseCase):
-    if use_case == UseCase.CODE_GENERATION:
-        examples = [
-            {
-                "score": 3,
-                "justification": """The code achieves 3 points by implementing core functionality correctly (1),
-                showing generally correct implementation with proper syntax (2),
-                and being suitable for professional use with good Python patterns and accurate functionality (3).
-                While it demonstrates competent development practices, it lacks the robust error handling
-                and type hints needed for point 4, and could benefit from better efficiency optimization and code organization."""
-            },
-            {
-                "score": 4,
-                "justification": """
-                The code earns 4 points by implementing basic functionality (1), showing correct implementation (2),
-                being production-ready (3), and demonstrating high efficiency with Python best practices
-                including proper error handling, type hints, and clear documentation (4).
-                It exhibits experienced developer qualities with well-structured code and maintainable design, though
-                it lacks the comprehensive testing and security considerations needed for a perfect score."""
-            }
-        ]
-    elif use_case == UseCase.TEXT2SQL:
-
-        examples = [ {
-            "score": 3,
-            "justification": """The query earns 3 points by successfully retrieving basic data (1),
-            showing correct logical implementation (2), and being suitable for
-            professional use with accurate data retrieval and good SQL pattern understanding (3).
-            However, it lacks efficiency optimizations and consistent style conventions needed for
-            point 4, using basic JOINs without considering indexing or performance implications.
-            While functional, the query would benefit from better organization and efficiency improvements."""
-        },
-
-        {
-            "score": 4,
-            "justification": """The query merits 4 points by retrieving basic data correctly (1), implementing proper
-            logic (2), being production-ready (3), and demonstrating high efficiency with proper
-            indexing considerations, well-structured JOINs, and consistent formatting (4). It
-            shows experienced developer qualities with appropriate commenting and performant SQL
-            features, though it lacks the comprehensive NULL handling and execution plan optimization needed for a
-            perfect score."""
-        }
-        ]
-    elif use_case ==UseCase.CUSTOM:
+
+    if use_case ==UseCase.CUSTOM:
         examples = []
+
+    else:
+        examples = USE_CASE_CONFIGS_EVALS[use_case].default_examples
 
 
     return {"examples": examples}
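
After this change the endpoints above are thin lookups into the config registries rather than hard-coded dictionaries. A minimal sketch of exercising them, assuming FastAPI's test client (httpx) is available in the test environment and using the `app` object that the decorators in app/main.py already reference:

# Sketch only; topic values shown are the ones the removed uc_topics dict contained,
# now assumed to come from USE_CASE_CONFIGS[use_case].topics.
from fastapi.testclient import TestClient
from app.main import app

client = TestClient(app)

r = client.get("/use-cases/text2sql/topics")
print(r.json())  # expected shape: {"topics": ["Aggregations", "Basic Queries", ...]}

# Eval examples come from USE_CASE_CONFIGS_EVALS[use_case].default_examples,
# except for the custom use case, which still returns an empty list.
r = client.get("/code_generation/eval_examples")
print(r.json())  # expected shape: {"examples": [{"score": ..., "justification": ...}, ...]}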

app/models/request_models.py

Lines changed: 6 additions & 5 deletions
@@ -2,12 +2,13 @@
 import os
 from pydantic import BaseModel, Field, field_validator, ConfigDict
 from enum import Enum
-from app.core.config import USE_CASE_CONFIGS
+from app.core.config import USE_CASE_CONFIGS, UseCase
 
-class UseCase(str, Enum):
-    CODE_GENERATION = "code_generation"
-    TEXT2SQL = "text2sql"
-    CUSTOM = "custom"
+
+# class UseCase(str, Enum):
+#     CODE_GENERATION = "code_generation"
+#     TEXT2SQL = "text2sql"
+#     CUSTOM = "custom"
 
 class Technique(str, Enum):
     SFT = "sft"
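
With the enum commented out here, UseCase is now imported from app.core.config, whose diff is collapsed above. A rough sketch of what it presumably contains; the first three members are the ones shown commented out, while the last two are assumptions inferred from the commit description and from the UseCase.LENDING_DATA reference in app/core/prompt_templates.py:

from enum import Enum

class UseCase(str, Enum):
    CODE_GENERATION = "code_generation"
    TEXT2SQL = "text2sql"
    CUSTOM = "custom"
    LENDING_DATA = "lending_data"          # assumed; referenced in prompt_templates.py
    CREDIT_CARD_DATA = "credit_card_data"  # assumed from the commit description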

app/run_job.py

Lines changed: 3 additions & 0 deletions
@@ -34,7 +34,10 @@
 from app.services.synthesis_service import SynthesisService
 import asyncio
 import nest_asyncio # Add this import
+import json, pandas as pd, numpy as np, os
+from app.agents.schema import GenerationPlan
 
+plan = GenerationPlan.model_validate_json(os.environ.get("JOB_PARAMS"))
 # Enable nested event loop
 nest_asyncio.apply()
 
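
run_job.py now parses a GenerationPlan from the JOB_PARAMS environment variable at import time, so the variable has to hold the serialized plan before the job script is launched. A minimal sketch of the launcher side, assuming GenerationPlan is a Pydantic v2 model (it exposes model_validate_json) and with purely hypothetical field names:

import json
import os

# Hypothetical plan fields; the real schema lives in app.agents.schema.GenerationPlan.
os.environ["JOB_PARAMS"] = json.dumps({
    "use_case": "lending_data",
    "num_questions": 100,
})

# app/run_job.py then executes at import time:
#   plan = GenerationPlan.model_validate_json(os.environ.get("JOB_PARAMS"))
# os.environ.get returns None when JOB_PARAMS is missing, which makes
# model_validate_json raise, so the job will not start without this variable set.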

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.

0 commit comments
