|
| 1 | +open System |
| 2 | +open System.IO |
| 3 | +open System.IO.Compression |
| 4 | + |
| 5 | +open Microsoft.ML |
| 6 | +open Microsoft.ML.Data |
| 7 | +open Microsoft.ML.Transforms.Normalizers |
| 8 | + |
| 9 | +// Data models |
/// One transaction row: a boolean label followed by 29 single-precision
/// columns (V1 through V28, then Amount).
/// CLIMutable makes the record expose a parameterless constructor and
/// settable properties; presumably required so the ML library can
/// populate instances -- confirm against the library documentation.
[<CLIMutable>]
type TransactionObservation =
    { Label: bool
      V1: float32
      V2: float32
      V3: float32
      V4: float32
      V5: float32
      V6: float32
      V7: float32
      V8: float32
      V9: float32
      V10: float32
      V11: float32
      V12: float32
      V13: float32
      V14: float32
      V15: float32
      V16: float32
      V17: float32
      V18: float32
      V19: float32
      V20: float32
      V21: float32
      V22: float32
      V23: float32
      V24: float32
      V25: float32
      V26: float32
      V27: float32
      V28: float32
      Amount: float32 }
| 43 | + |
/// Output row of a prediction: the label carried with the observation,
/// the predicted label, and the accompanying score and probability.
/// CLIMutable makes the record expose a parameterless constructor and
/// settable properties.
[<CLIMutable>]
type TransactionFraudPrediction =
    { Label: bool
      PredictedLabel: bool
      Score: float32
      Probability: float32 }
| 51 | + |
/// Program entry point. In order, it:
///   1. extracts the zipped dataset if creditcard.csv is not present,
///   2. splits the data into train/test CSV files if they are not present,
///   3. trains a FastTree binary classifier on the train file,
///   4. prints the accuracy measured on the test file,
///   5. saves the model, loads it back, and prints predictions for up to
///      five test rows whose Label is true.
/// Command-line arguments are ignored. Returns 0 as the exit code.
[<EntryPoint>]
let main _ =

    (*
        File names and location
    *)

    let appDirectory =
        Environment.GetCommandLineArgs().[0]
        |> Path.GetDirectoryName

    let dataDirectory = Path.Combine (appDirectory, "../../../../Data/")

    let zippedDatasetFile = Path.Combine (dataDirectory, "creditcardfraud-dataset.zip")

    let inputFile = Path.Combine (dataDirectory, "creditcard.csv")
    let trainFile = Path.Combine (dataDirectory, "trainData.csv")
    let testFile = Path.Combine (dataDirectory, "testData.csv")

    let modelFile = Path.Combine (dataDirectory, "fastTree.zip")

    (*
        Prepare input file from original zipped dataset
    *)

    if not (File.Exists inputFile)
    then
        printfn "Extracting dataset"
        ZipFile.ExtractToDirectory (zippedDatasetFile, dataDirectory)

    let seed = Nullable 1
    let mlContext = MLContext seed

    // V1..V28 occupy indices 1..28 in both the raw file and the split files.
    // A function (not a value) so every loader gets its own Column instances.
    let vColumns () =
        [ for i in 1 .. 28 -> TextLoader.Column(sprintf "V%d" i, Nullable DataKind.R4, i) ]

    // Layout of the raw input file: the label is read from index 30.
    let columns =
        [|
            // A boolean column depicting the 'label'.
            yield TextLoader.Column("Label", Nullable DataKind.BL, 30)
            // 29 Features V1..V28 + Amount
            yield! vColumns ()
            yield TextLoader.Column("Amount", Nullable DataKind.R4, 29)
        |]

    let loaderArgs = TextLoader.Arguments()
    loaderArgs.Column <- columns
    loaderArgs.HasHeader <- true
    loaderArgs.Separator <- ","

    let reader = TextLoader (mlContext, loaderArgs)

    let classification = BinaryClassificationContext mlContext

    (*
        Split the data 80:20 into train and test files,
        if the files do not exist yet.
    *)

    if not (File.Exists trainFile && File.Exists testFile)
    then
        printfn "Preparing train and test data"

        let data =
            MultiFileSource inputFile
            |> reader.Read

        let trainData, testData =
            classification.TrainTestSplit (data, 0.2)
            |> fun x -> x.ToTuple ()

        // save test split
        use testStream = File.Create testFile
        mlContext.Data.SaveAsText(testData, testStream, separatorChar = ',', headerRow = true, schema = true)

        // save train split
        use trainStream = File.Create trainFile
        mlContext.Data.SaveAsText(trainData, trainStream, separatorChar = ',', headerRow = true, schema = true)

    (*
        Read the train and test data from file
    *)

    // Layout of the saved split files: Label is read from column 0, and a
    // "StratificationColumn" (added by classification.TrainTestSplit()) is
    // read from column 30.
    let columnsPlus =
        [|
            // A boolean column depicting the 'label'.
            yield TextLoader.Column("Label", Nullable DataKind.BL, 0)
            // V1..V28 + Amount + StratificationColumn
            yield! vColumns ()
            yield TextLoader.Column("Amount", Nullable DataKind.R4, 29)
            yield TextLoader.Column("StratificationColumn", Nullable DataKind.R4, 30)
        |]

    // Single place that knows how a split file is parsed (header row, comma separated).
    let readSplit (file: string) =
        mlContext.Data.ReadFromTextFile (file, columnsPlus, hasHeader = true, separatorChar = ',')

    printfn "Reading train and test data"
    let trainData = readSplit trainFile
    let testData = readSplit testFile

    (*
        Create a flexible pipeline (composed by a chain of estimators)
        for building/training the model.
    *)

    // Every loaded column except the label and the split bookkeeping column.
    let featureColumnNames =
        trainData.Schema
        |> Seq.map (fun column -> column.Name)
        |> Seq.filter (fun name -> name <> "Label" && name <> "StratificationColumn")
        |> Seq.toArray

    // NOTE(review): the normalizer writes to "FeaturesNormalizedByMeanVar", but the
    // trainer below is given "Features", so the normalized column does not appear
    // to be consumed. Left unchanged to keep the trained model identical -- confirm intent.
    let pipeline =
        mlContext.Transforms.Concatenate ("Features", featureColumnNames)
        |> fun x ->
            x.Append (
                mlContext.Transforms.Normalize (
                    "Features",
                    "FeaturesNormalizedByMeanVar",
                    NormalizingEstimator.NormalizerMode.MeanVariance
                )
            )
        |> fun x ->
            x.Append (
                mlContext.BinaryClassification.Trainers.FastTree(
                    "Label",
                    "Features",
                    numLeaves = 20,
                    numTrees = 100,
                    minDatapointsInLeaves = 10,
                    learningRate = 0.2
                )
            )

    printfn "Training model"
    let model = pipeline.Fit trainData

    let metrics = classification.Evaluate (model.Transform (testData), "Label")
    printfn "Accuracy: %.2f" metrics.Accuracy

    printfn "Saving model to file"
    // Nested binding so the stream is disposed before the model is read back below.
    let _ =
        use fs = new FileStream (modelFile, FileMode.Create, FileAccess.Write, FileShare.Write)
        mlContext.Model.Save(model, fs)

    (*
        Read the model and test data from file,
        and make predictions
    *)

    printfn "Reading model and test data"
    let modelEvaluator =
        use file = File.OpenRead modelFile
        mlContext.Model.Load(file)
    let predictionEngine = modelEvaluator.CreatePredictionEngine<TransactionObservation, TransactionFraudPrediction>(mlContext)

    let testData = readSplit testFile

    printfn "Making predictions"
    testData.AsEnumerable<TransactionObservation>(mlContext, reuseRowObject = false)
    |> Seq.filter (fun x -> x.Label)
    // Use at most 5 observations from the test data. Seq.truncate (unlike
    // Seq.take) does not throw when fewer than 5 rows match.
    |> Seq.truncate 5
    |> Seq.iter (fun observation ->
        let prediction = predictionEngine.Predict observation
        printfn "%A" prediction
        printfn "------"
        )

    printfn "Press Enter to quit"
    // ReadLine matches the prompt (waits for Enter) and, unlike ReadKey,
    // does not throw when standard input is redirected.
    Console.ReadLine () |> ignore

    0 // return an integer exit code
0 commit comments