Skip to content

Updated handling of missing values with LightGBM, and added ability to use (0) as missing value #4695

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Feb 10, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
5e0b4b6
Update LightGbmTrainerBase.cs
mstfbl Jan 23, 2020
2680b19
Update LightGbmTrainerBase.cs
mstfbl Jan 23, 2020
d857b85
Added UseZeroAsMissingValue as a modifiable LightGBM flag
mstfbl Jan 24, 2020
0faa474
Update core_manifest.json
mstfbl Jan 29, 2020
d3274ea
Merge remote-tracking branch 'upstream/master' into Issue-4681
mstfbl Feb 3, 2020
c72ed38
Updated baseline files for LightGBMClassificationTest()
mstfbl Feb 3, 2020
06754f5
Updated baseline files for GossLightGBMTest()
mstfbl Feb 3, 2020
4397725
Updated baseline files for DartLightGBMTest()
mstfbl Feb 3, 2020
3f082e0
Revert "Updated baseline files for DartLightGBMTest()"
mstfbl Feb 5, 2020
046f5b0
Revert "Updated baseline files for GossLightGBMTest()"
mstfbl Feb 5, 2020
9c426e1
Revert "Updated baseline files for LightGBMClassificationTest()"
mstfbl Feb 5, 2020
00aaf35
Update TestPredictors.cs
mstfbl Feb 5, 2020
59bcaca
Revert "Revert "Updated baseline files for LightGBMClassificationTest()""
mstfbl Feb 6, 2020
4c0ff8a
Revert "Revert "Updated baseline files for GossLightGBMTest()""
mstfbl Feb 6, 2020
5f5e950
Revert "Revert "Updated baseline files for DartLightGBMTest()""
mstfbl Feb 6, 2020
6f54895
Updated test datasets and LightGbm flag shortnames
mstfbl Feb 6, 2020
0ea84a6
Added test to confirm run-time behavior of LightGBM doesn't change
mstfbl Feb 6, 2020
5f39dc0
Update core_manifest.json
mstfbl Feb 6, 2020
bc69ac6
Added correct baseline for LightGBMPreviousModelBaselineTest()
mstfbl Feb 7, 2020
906c1a5
Added previously trained model at accessible location for all builds
mstfbl Feb 7, 2020
fdf61ba
Update used library for NetFx builds
mstfbl Feb 7, 2020
955ecfc
Merge remote-tracking branch 'upstream/master' into Issue-4681
mstfbl Feb 10, 2020
59ba7fe
Updated LightGBMPreviousModelBaselineTest
mstfbl Feb 10, 2020
5c79905
Update LightGBMPreviousModelBaselineTest
mstfbl Feb 10, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ public static IEnumerable<SweepableParam> BuildLightGbmParams()
new SweepableDiscreteParam("MinimumExampleCountPerLeaf", new object[] { 1, 10, 20, 50 }),
new SweepableDiscreteParam("UseCategoricalSplit", new object[] { true, false }),
new SweepableDiscreteParam("HandleMissingValue", new object[] { true, false }),
new SweepableDiscreteParam("UseZeroAsMissingValue", new object[] { true, false }),
new SweepableDiscreteParam("MinimumExampleCountPerGroup", new object[] { 10, 50, 100, 200 }),
new SweepableDiscreteParam("MaximumCategoricalSplitPointCount", new object[] { 8, 16, 32, 64 }),
new SweepableDiscreteParam("CategoricalSmoothing", new object[] { 1, 10, 20 }),
Expand Down
15 changes: 12 additions & 3 deletions src/Microsoft.ML.LightGbm/LightGbmTrainerBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ public class OptionsBase : TrainerInputBaseWithGroupId
{nameof(MaximumCategoricalSplitPointCount), "max_cat_threshold" },
{nameof(CategoricalSmoothing), "cat_smooth" },
{nameof(L2CategoricalRegularization), "cat_l2" },
{nameof(HandleMissingValue), "use_missing" }
{nameof(HandleMissingValue), "use_missing" },
{nameof(UseZeroAsMissingValue), "zero_as_missing" }
};

private protected string GetOptionName(string name)
Expand Down Expand Up @@ -174,10 +175,17 @@ private protected OptionsBase() { }
/// <summary>
/// Whether to enable special handling of missing values.
/// </summary>
[Argument(ArgumentType.AtMostOnce, HelpText = "Enable special handling of missing value or not.")]
[Argument(ArgumentType.AtMostOnce, HelpText = "Enable special handling of missing value or not.", ShortName = "hmv")]
[TlcModule.SweepableDiscreteParam("UseMissing", new object[] { true, false })]
public bool HandleMissingValue = true;

/// <summary>
/// Whether to treat zero (0) as a missing value.
/// </summary>
[Argument(ArgumentType.AtMostOnce, HelpText = "Enable usage of zero (0) as missing value.", ShortName = "uzam")]
[TlcModule.SweepableDiscreteParam("UseZeroAsMissing", new object[] { true, false })]
public bool UseZeroAsMissingValue = false;

/// <summary>
/// The minimum number of data points per categorical group.
/// </summary>
Expand Down Expand Up @@ -259,6 +267,7 @@ internal virtual Dictionary<string, object> ToDictionary(IHost host)

res[GetOptionName(nameof(MaximumBinCountPerFeature))] = MaximumBinCountPerFeature;
res[GetOptionName(nameof(HandleMissingValue))] = HandleMissingValue;
res[GetOptionName(nameof(UseZeroAsMissingValue))] = UseZeroAsMissingValue;
res[GetOptionName(nameof(MinimumExampleCountPerGroup))] = MinimumExampleCountPerGroup;
res[GetOptionName(nameof(MaximumCategoricalSplitPointCount))] = MaximumCategoricalSplitPointCount;
res[GetOptionName(nameof(CategoricalSmoothing))] = CategoricalSmoothing;
Expand Down Expand Up @@ -436,7 +445,7 @@ private protected virtual void GetDefaultParameters(IChannel ch, int numRow, boo

private FloatLabelCursor.Factory CreateCursorFactory(RoleMappedData data)
{
var loadFlags = CursOpt.AllLabels | CursOpt.Features;
var loadFlags = CursOpt.AllLabels | CursOpt.AllFeatures;
if (PredictionKind == PredictionKind.Ranking)
loadFlags |= CursOpt.Group;

Expand Down
88 changes: 88 additions & 0 deletions test/BaselineOutput/Common/EntryPoints/core_manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -11636,6 +11636,9 @@
"Name": "HandleMissingValue",
"Type": "Bool",
"Desc": "Enable special handling of missing value or not.",
"Aliases": [
"hmv"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
Expand All @@ -11648,6 +11651,25 @@
]
}
},
{
"Name": "UseZeroAsMissingValue",
"Type": "Bool",
"Desc": "Enable usage of zero (0) as missing value.",
"Aliases": [
"uzam"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
"Default": false,
"SweepRange": {
"RangeType": "Discrete",
"Values": [
true,
false
]
}
},
{
"Name": "MinimumExampleCountPerGroup",
"Type": "Int",
Expand Down Expand Up @@ -12133,6 +12155,9 @@
"Name": "HandleMissingValue",
"Type": "Bool",
"Desc": "Enable special handling of missing value or not.",
"Aliases": [
"hmv"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
Expand All @@ -12145,6 +12170,25 @@
]
}
},
{
"Name": "UseZeroAsMissingValue",
"Type": "Bool",
"Desc": "Enable usage of zero (0) as missing value.",
"Aliases": [
"uzam"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
"Default": false,
"SweepRange": {
"RangeType": "Discrete",
"Values": [
true,
false
]
}
},
{
"Name": "MinimumExampleCountPerGroup",
"Type": "Int",
Expand Down Expand Up @@ -12630,6 +12674,9 @@
"Name": "HandleMissingValue",
"Type": "Bool",
"Desc": "Enable special handling of missing value or not.",
"Aliases": [
"hmv"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
Expand All @@ -12642,6 +12689,25 @@
]
}
},
{
"Name": "UseZeroAsMissingValue",
"Type": "Bool",
"Desc": "Enable usage of zero (0) as missing value.",
"Aliases": [
"uzam"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
"Default": false,
"SweepRange": {
"RangeType": "Discrete",
"Values": [
true,
false
]
}
},
{
"Name": "MinimumExampleCountPerGroup",
"Type": "Int",
Expand Down Expand Up @@ -13088,6 +13154,9 @@
"Name": "HandleMissingValue",
"Type": "Bool",
"Desc": "Enable special handling of missing value or not.",
"Aliases": [
"hmv"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
Expand All @@ -13100,6 +13169,25 @@
]
}
},
{
"Name": "UseZeroAsMissingValue",
"Type": "Bool",
"Desc": "Enable usage of zero (0) as missing value.",
"Aliases": [
"uzam"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
"Default": false,
"SweepRange": {
"RangeType": "Discrete",
"Values": [
true,
false
]
}
},
{
"Name": "MinimumExampleCountPerGroup",
"Type": "Int",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,42 +12,42 @@ Confusion table
||======================
PREDICTED || positive | negative | Recall
TRUTH ||======================
positive || 128 | 6 | 0.9552
negative || 10 | 218 | 0.9561
positive || 124 | 10 | 0.9254
negative || 9 | 219 | 0.9605
||======================
Precision || 0.9275 | 0.9732 |
OVERALL 0/1 ACCURACY: 0.955801
LOG LOSS/instance: 0.301908
Precision || 0.9323 | 0.9563 |
OVERALL 0/1 ACCURACY: 0.947514
LOG LOSS/instance: 0.303740
Test-set entropy (prior Log-Loss/instance): 0.950799
LOG-LOSS REDUCTION (RIG): 0.682470
AUC: 0.982473
LOG-LOSS REDUCTION (RIG): 0.680543
AUC: 0.984944
TEST POSITIVE RATIO: 0.3175 (107.0/(107.0+230.0))
Confusion table
||======================
PREDICTED || positive | negative | Recall
TRUTH ||======================
positive || 100 | 7 | 0.9346
negative || 8 | 222 | 0.9652
positive || 97 | 10 | 0.9065
negative || 10 | 220 | 0.9565
||======================
Precision || 0.9259 | 0.9694 |
OVERALL 0/1 ACCURACY: 0.955490
LOG LOSS/instance: 0.290926
Precision || 0.9065 | 0.9565 |
OVERALL 0/1 ACCURACY: 0.940653
LOG LOSS/instance: 0.297583
Test-set entropy (prior Log-Loss/instance): 0.901650
LOG-LOSS REDUCTION (RIG): 0.677340
AUC: 0.992076
LOG-LOSS REDUCTION (RIG): 0.669957
AUC: 0.991833

OVERALL RESULTS
---------------------------------------
AUC: 0.987274 (0.0048)
Accuracy: 0.955645 (0.0002)
Positive precision: 0.926731 (0.0008)
Positive recall: 0.944902 (0.0103)
Negative precision: 0.971323 (0.0019)
Negative recall: 0.960679 (0.0045)
Log-loss: 0.296417 (0.0055)
Log-loss reduction: 0.679905 (0.0026)
F1 Score: 0.935705 (0.0055)
AUPRC: 0.969894 (0.0121)
AUC: 0.988388 (0.0034)
Accuracy: 0.944083 (0.0034)
Positive precision: 0.919436 (0.0129)
Positive recall: 0.915958 (0.0094)
Negative precision: 0.956427 (0.0001)
Negative recall: 0.958524 (0.0020)
Log-loss: 0.300661 (0.0031)
Log-loss reduction: 0.675250 (0.0053)
F1 Score: 0.917691 (0.0111)
AUPRC: 0.972137 (0.0107)

---------------------------------------
Physical memory usage(MB): %Number%
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
LightGBM
AUC Accuracy Positive precision Positive recall Negative precision Negative recall Log-loss Log-loss reduction F1 Score AUPRC /iter /lr /nl /mil /booster /nt Learner Name Train Dataset Test Dataset Results File Run Time Physical Memory Virtual Memory Command Line Settings
0.987274 0.955645 0.926731 0.944902 0.971323 0.960679 0.296417 0.679905 0.935705 0.969894 10 0.2 20 10 dart 1 LightGBM %Data% %Output% 99 0 0 maml.exe CV tr=LightGBM{nt=1 iter=10 booster=dart lr=0.2 mil=10 nl=20} threads=- cache=- dout=%Output% loader=Text{sparse- col=Attr:TX:6 col=Label:0 col=Features:1-5,6,7-9} data=%Data% seed=1 /iter:10;/lr:0.2;/nl:20;/mil:10;/booster:dart;/nt:1
0.988388 0.944083 0.919436 0.915958 0.956427 0.958524 0.300661 0.67525 0.917691 0.972137 10 0.2 20 10 dart 1 LightGBM %Data% %Output% 99 0 0 maml.exe CV tr=LightGBM{nt=1 iter=10 booster=dart lr=0.2 mil=10 nl=20} threads=- cache=- dout=%Output% loader=Text{sparse- col=Attr:TX:6 col=Label:0 col=Features:1-5,6,7-9} data=%Data% seed=1 /iter:10;/lr:0.2;/nl:20;/mil:10;/booster:dart;/nt:1

Loading