Skip to content

Commit a228b3e

Browse files
Credit card fraud detection in F# (dotnet#241)
1 parent 479cfca commit a228b3e

File tree

4 files changed

+336
-0
lines changed

4 files changed

+336
-0
lines changed
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
2+
Microsoft Visual Studio Solution File, Format Version 12.00
3+
# Visual Studio 15
4+
VisualStudioVersion = 15.0.28307.106
5+
MinimumVisualStudioVersion = 10.0.40219.1
6+
Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "CreditCardFraudDetection", "CreditCardFraudDetection\CreditCardFraudDetection.fsproj", "{1681B36C-3E8B-4729-916C-EAC1A652B369}"
7+
EndProject
8+
Global
9+
GlobalSection(SolutionConfigurationPlatforms) = preSolution
10+
Debug|Any CPU = Debug|Any CPU
11+
Release|Any CPU = Release|Any CPU
12+
EndGlobalSection
13+
GlobalSection(ProjectConfigurationPlatforms) = postSolution
14+
{1681B36C-3E8B-4729-916C-EAC1A652B369}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15+
{1681B36C-3E8B-4729-916C-EAC1A652B369}.Debug|Any CPU.Build.0 = Debug|Any CPU
16+
{1681B36C-3E8B-4729-916C-EAC1A652B369}.Release|Any CPU.ActiveCfg = Release|Any CPU
17+
{1681B36C-3E8B-4729-916C-EAC1A652B369}.Release|Any CPU.Build.0 = Release|Any CPU
18+
EndGlobalSection
19+
GlobalSection(SolutionProperties) = preSolution
20+
HideSolutionNode = FALSE
21+
EndGlobalSection
22+
GlobalSection(ExtensibilityGlobals) = postSolution
23+
SolutionGuid = {AFA794E8-099D-4DCD-8454-2B746EFD0567}
24+
EndGlobalSection
25+
EndGlobal
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<TargetFramework>netcoreapp2.1</TargetFramework>
6+
</PropertyGroup>
7+
8+
<ItemGroup>
9+
<Compile Include="Program.fs" />
10+
</ItemGroup>
11+
12+
<ItemGroup>
13+
<PackageReference Include="Microsoft.ML" Version="0.9.0" />
14+
</ItemGroup>
15+
16+
</Project>
Lines changed: 295 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,295 @@
1+
open System
2+
open System.IO
3+
open System.IO.Compression
4+
5+
open Microsoft.ML
6+
open Microsoft.ML.Data
7+
open Microsoft.ML.Transforms.Normalizers
8+
9+
// Data models
10+
/// One row of the credit-card dataset as it is handed to the prediction engine.
/// `CLIMutable` gives the record a default constructor and settable properties
/// so it can be populated by reflection.
[<CLIMutable>]
type TransactionObservation =
    {
        /// Boolean target column of the row.
        Label: bool
        // Numeric feature columns V1..V28.
        V1: float32; V2: float32; V3: float32; V4: float32
        V5: float32; V6: float32; V7: float32; V8: float32
        V9: float32; V10: float32; V11: float32; V12: float32
        V13: float32; V14: float32; V15: float32; V16: float32
        V17: float32; V18: float32; V19: float32; V20: float32
        V21: float32; V22: float32; V23: float32; V24: float32
        V25: float32; V26: float32; V27: float32; V28: float32
        /// The `Amount` column of the row.
        Amount: float32
    }
43+
44+
/// Output row produced by the prediction engine for one transaction.
/// `CLIMutable` gives the record a default constructor and settable properties
/// so it can be populated by reflection.
[<CLIMutable>]
type TransactionFraudPrediction =
    {
        /// Label carried over from the input row.
        Label: bool
        /// Class chosen by the model.
        PredictedLabel: bool
        /// Raw score emitted by the model.
        Score: float32
        /// Probability emitted by the model.
        Probability: float32
    }
51+
52+
/// Entry point. Extracts the dataset if needed, splits it into train and test
/// files, trains a FastTree binary classifier, prints its accuracy, saves the
/// model, then reloads it and prints predictions for up to five test rows
/// whose `Label` is true.
///
/// Command-line arguments are ignored. Returns 0 as the process exit code.
[<EntryPoint>]
let main _ =

    (*
        File names and location
    *)

    // Directory part of the first command-line argument (the program path).
    let appDirectory =
        Environment.GetCommandLineArgs().[0]
        |> Path.GetDirectoryName

    let dataDirectory = Path.Combine (appDirectory, "../../../../Data/")

    let zippedDatasetFile = Path.Combine (dataDirectory, "creditcardfraud-dataset.zip")

    let inputFile = Path.Combine (dataDirectory, "creditcard.csv")
    let trainFile = Path.Combine (dataDirectory, "trainData.csv")
    let testFile = Path.Combine (dataDirectory, "testData.csv")

    let modelFile = Path.Combine (dataDirectory, "fastTree.zip")

    (*
        Prepare input file from original zipped dataset
    *)

    if not (File.Exists inputFile) then
        printfn "Extracting dataset"
        ZipFile.ExtractToDirectory (zippedDatasetFile, dataDirectory)

    let seed = Nullable 1
    let mlContext = MLContext seed

    // The 29 feature columns shared by every file layout used below:
    // V1..V28 at indices 1..28 and Amount at index 29.
    // A function, so that each loader gets its own Column instances.
    let featureColumns () =
        [
            for i in 1 .. 28 do
                yield TextLoader.Column (sprintf "V%d" i, Nullable DataKind.R4, i)
            yield TextLoader.Column ("Amount", Nullable DataKind.R4, 29)
        ]

    // Layout of the original input file: the boolean label is at index 30.
    let columns =
        [|
            yield TextLoader.Column ("Label", Nullable DataKind.BL, 30)
            yield! featureColumns ()
        |]

    let loaderArgs = TextLoader.Arguments()
    loaderArgs.Column <- columns
    loaderArgs.HasHeader <- true
    loaderArgs.Separator <- ","

    let reader = TextLoader (mlContext, loaderArgs)

    let classification = BinaryClassificationContext mlContext

    (*
        Split the data 80:20 into train and test files,
        if the files do not exist yet.
    *)

    if not (File.Exists trainFile && File.Exists testFile) then
        printfn "Preparing train and test data"

        let data =
            MultiFileSource inputFile
            |> reader.Read

        let trainData, testData =
            classification.TrainTestSplit (data, 0.2)
            |> fun x -> x.ToTuple ()

        // Save the test split first, then the train split. Each stream lives
        // only for its own iteration, so it is closed before the next file
        // is created.
        for path, split in [ testFile, testData; trainFile, trainData ] do
            use fileStream = File.Create path
            mlContext.Data.SaveAsText (split, fileStream, separatorChar = ',', headerRow = true, schema = true)

    (*
        Read the train and test data from file
    *)

    // Layout of the saved split files: Label is moved to column 0 and the
    // "StratificationColumn" added by classification.TrainTestSplit() is
    // appended at index 30.
    let columnsPlus =
        [|
            yield TextLoader.Column ("Label", Nullable DataKind.BL, 0)
            yield! featureColumns ()
            yield TextLoader.Column ("StratificationColumn", Nullable DataKind.R4, 30)
        |]

    // Reads one saved split; header and separator match what SaveAsText wrote above.
    let readSplit (path: string) =
        mlContext.Data.ReadFromTextFile (path, columnsPlus, hasHeader = true, separatorChar = ',')

    printfn "Reading train and test data"
    let trainData = readSplit trainFile
    let testData = readSplit testFile

    (*
        Create a flexible pipeline (composed by a chain of estimators)
        for building/training the model.
    *)

    // Every schema column except the label and the split helper column is a feature.
    let featureColumnNames =
        trainData.Schema
        |> Seq.map (fun column -> column.Name)
        |> Seq.filter (fun name -> name <> "Label" && name <> "StratificationColumn")
        |> Seq.toArray

    let pipeline =
        mlContext.Transforms.Concatenate ("Features", featureColumnNames)
        |> fun x ->
            x.Append (
                mlContext.Transforms.Normalize (
                    "Features",
                    "FeaturesNormalizedByMeanVar",
                    NormalizingEstimator.NormalizerMode.MeanVariance
                )
            )
        |> fun x ->
            x.Append (
                mlContext.BinaryClassification.Trainers.FastTree (
                    "Label",
                    // Train on the normalized column produced by the previous
                    // step; passing "Features" here left that step unused.
                    "FeaturesNormalizedByMeanVar",
                    numLeaves = 20,
                    numTrees = 100,
                    minDatapointsInLeaves = 10,
                    learningRate = 0.2
                )
            )

    printfn "Training model"
    let model = pipeline.Fit trainData

    let metrics = classification.Evaluate (model.Transform (testData), "Label")
    printfn "Accuracy: %.2f" metrics.Accuracy

    printfn "Saving model to file"
    let _ =
        use fs = new FileStream (modelFile, FileMode.Create, FileAccess.Write, FileShare.Write)
        mlContext.Model.Save (model, fs)

    (*
        Read the model and test data from file,
        and make predictions
    *)

    printfn "Reading model and test data"
    let modelEvaluator =
        use file = File.OpenRead modelFile
        mlContext.Model.Load (file)
    let predictionEngine = modelEvaluator.CreatePredictionEngine<TransactionObservation, TransactionFraudPrediction>(mlContext)

    let testData = readSplit testFile

    printfn "Making predictions"
    testData.AsEnumerable<TransactionObservation>(mlContext, reuseRowObject = false)
    |> Seq.filter (fun x -> x.Label)
    // Use at most 5 observations from the test data. Seq.truncate, unlike
    // Seq.take, does not throw when fewer than 5 rows match.
    |> Seq.truncate 5
    |> Seq.iter (fun observation ->
        let prediction = predictionEngine.Predict observation
        printfn "%A" prediction
        printfn "------"
    )

    printfn "Press Enter to quit"
    // ReadLine waits for Enter, as the prompt says, and does not throw when
    // standard input is redirected (ReadKey does).
    Console.ReadLine () |> ignore

    0 // return an integer exit code

0 commit comments

Comments
 (0)